howard.objects.variants

   1import csv
   2import gc
   3import gzip
   4import io
   5import multiprocessing
   6import os
   7import random
   8import re
   9import shlex
  10import sqlite3
  11import subprocess
  12from tempfile import NamedTemporaryFile, TemporaryDirectory
  13import tempfile
  14import duckdb
  15import json
  16import yaml
  17import argparse
  18import Bio.bgzf as bgzf
  19import pandas as pd
  20from pyfaidx import Fasta
  21import numpy as np
  22import vcf
  23import logging as log
  24import fastparquet as fp
  25from multiprocesspandas import applyparallel
  26
  27from howard.functions.commons import *
  28from howard.objects.database import *
  29from howard.functions.databases import *
  30from howard.functions.utils import *
  31
  32
  33class Variants:
  34
  35    def __init__(
  36        self,
  37        conn=None,
  38        input: str = None,
  39        output: str = None,
  40        config: dict = {},
  41        param: dict = {},
  42        load: bool = False,
  43    ) -> None:
  44        """
  45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  46        header
  47
  48        :param conn: the connection to the database
  49        :param input: the input file
  50        :param output: the output file
  51        :param config: a dictionary containing the configuration of the model
  52        :param param: a dictionary containing the parameters of the model
  53        """
  54
  55        # Init variables
  56        self.init_variables()
  57
  58        # Input
  59        self.set_input(input)
  60
  61        # Config
  62        self.set_config(config)
  63
  64        # Param
  65        self.set_param(param)
  66
  67        # Output
  68        self.set_output(output)
  69
  70        # connexion
  71        self.set_connexion(conn)
  72
  73        # Header
  74        self.set_header()
  75
  76        # Load data
  77        if load:
  78            self.load_data()
  79
  80    def set_input(self, input: str = None) -> None:
  81        """
  82        The function takes a file name as input, splits the file name into a name and an extension, and
  83        then sets the input_name, input_extension, and input_format attributes of the class
  84
  85        :param input: The input file
  86        """
  87
  88        if input and not isinstance(input, str):
  89            try:
  90                self.input = input.name
  91            except:
  92                log.error(f"Input file '{input} in bad format")
  93                raise ValueError(f"Input file '{input} in bad format")
  94        else:
  95            self.input = input
  96
  97        # Input format
  98        if input:
  99            input_name, input_extension = os.path.splitext(self.input)
 100            self.input_name = input_name
 101            self.input_extension = input_extension
 102            self.input_format = self.input_extension.replace(".", "")
 103
 104    def set_config(self, config: dict) -> None:
 105        """
 106        This function takes in a config object and sets it as the config object for the class
 107
 108        :param config: The configuration object
 109        """
 110        self.config = config
 111
 112    def set_param(self, param: dict) -> None:
 113        """
 114        This function takes in a param object and sets it as the param object for the class
 115
 116        :param param: The paramters object
 117        """
 118        self.param = param
 119
 120    def init_variables(self) -> None:
 121        """
 122        This function initializes the variables that will be used in the rest of the class
 123        """
 124        self.prefix = "howard"
 125        self.table_variants = "variants"
 126        self.dataframe = None
 127
 128        self.comparison_map = {
 129            "gt": ">",
 130            "gte": ">=",
 131            "lt": "<",
 132            "lte": "<=",
 133            "equals": "=",
 134            "contains": "SIMILAR TO",
 135        }
 136
 137        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 138
 139        self.code_type_map_to_sql = {
 140            "Integer": "INTEGER",
 141            "String": "VARCHAR",
 142            "Float": "FLOAT",
 143            "Flag": "VARCHAR",
 144        }
 145
 146        self.index_additionnal_fields = []
 147
 148    def get_indexing(self) -> bool:
 149        """
 150        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 151        returns False.
 152        :return: The value of the indexing parameter.
 153        """
 154        return self.get_param().get("indexing", False)
 155
 156    def get_connexion_config(self) -> dict:
 157        """
 158        The function `get_connexion_config` returns a dictionary containing the configuration for a
 159        connection, including the number of threads and memory limit.
 160        :return: a dictionary containing the configuration for the Connexion library.
 161        """
 162
 163        # config
 164        config = self.get_config()
 165
 166        # Connexion config
 167        connexion_config = {}
 168        threads = self.get_threads()
 169
 170        # Threads
 171        if threads:
 172            connexion_config["threads"] = threads
 173
 174        # Memory
 175        # if config.get("memory", None):
 176        #     connexion_config["memory_limit"] = config.get("memory")
 177        if self.get_memory():
 178            connexion_config["memory_limit"] = self.get_memory()
 179
 180        # Temporary directory
 181        if config.get("tmp", None):
 182            connexion_config["temp_directory"] = config.get("tmp")
 183
 184        # Access
 185        if config.get("access", None):
 186            access = config.get("access")
 187            if access in ["RO"]:
 188                access = "READ_ONLY"
 189            elif access in ["RW"]:
 190                access = "READ_WRITE"
 191            connexion_db = self.get_connexion_db()
 192            if connexion_db in ":memory:":
 193                access = "READ_WRITE"
 194            connexion_config["access_mode"] = access
 195
 196        return connexion_config
 197
 198    def get_duckdb_settings(self) -> dict:
 199        """
 200        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 201        string.
 202        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 203        """
 204
 205        # config
 206        config = self.get_config()
 207
 208        # duckdb settings
 209        duckdb_settings_dict = {}
 210        if config.get("duckdb_settings", None):
 211            duckdb_settings = config.get("duckdb_settings")
 212            duckdb_settings = full_path(duckdb_settings)
 213            # duckdb setting is a file
 214            if os.path.exists(duckdb_settings):
 215                with open(duckdb_settings) as json_file:
 216                    duckdb_settings_dict = yaml.safe_load(json_file)
 217            # duckdb settings is a string
 218            else:
 219                duckdb_settings_dict = json.loads(duckdb_settings)
 220
 221        return duckdb_settings_dict
 222
 223    def set_connexion_db(self) -> str:
 224        """
 225        The function `set_connexion_db` returns the appropriate database connection string based on the
 226        input format and connection type.
 227        :return: the value of the variable `connexion_db`.
 228        """
 229
 230        # Default connexion db
 231        default_connexion_db = ":memory:"
 232
 233        # Find connexion db
 234        if self.get_input_format() in ["db", "duckdb"]:
 235            connexion_db = self.get_input()
 236        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 237            connexion_db = default_connexion_db
 238        elif self.get_connexion_type() in ["tmpfile"]:
 239            tmp_name = tempfile.mkdtemp(
 240                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 241            )
 242            connexion_db = f"{tmp_name}/tmp.db"
 243        elif self.get_connexion_type() != "":
 244            connexion_db = self.get_connexion_type()
 245        else:
 246            connexion_db = default_connexion_db
 247
 248        # Set connexion db
 249        self.connexion_db = connexion_db
 250
 251        return connexion_db
 252
    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connexion.

        When ``conn`` is None, a new connexion is opened — DuckDB by
        default, or SQLite when the config "connexion_format" says so —
        on the database string produced by ``set_connexion_db``. DuckDB
        settings from the configuration are applied as PRAGMA statements.

        :param conn: an existing connexion to reuse; if None, a new
            connexion is created (possibly to an in-memory database)
        """

        # Connexion db (":memory:", a file path, ...)
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, access mode, ...)
        connexion_config = self.get_connexion_config()

        # Connexion format ("duckdb" by default, or "sqlite")
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply configured duckDB settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # String values must be quoted inside the PRAGMA
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)
            # NOTE(review): an unrecognized connexion_format silently
            # leaves conn as None — confirm this is intended

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 295
 296    def set_output(self, output: str = None) -> None:
 297        """
 298        If the config file has an output key, set the output to the value of that key. Otherwise, set
 299        the output to the input
 300
 301        :param output: The name of the output file
 302        """
 303
 304        if output and not isinstance(output, str):
 305            self.output = output.name
 306        else:
 307            self.output = output
 308
 309        # Output format
 310        if self.output:
 311            output_name, output_extension = os.path.splitext(self.output)
 312            self.output_name = output_name
 313            self.output_extension = output_extension
 314            self.output_format = self.output_extension.replace(".", "")
 315        else:
 316            self.output_name = None
 317            self.output_extension = None
 318            self.output_format = None
 319
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``header_list``) and as a ``vcf.Reader`` object
        (``header_vcf``).

        The header is looked up, in order: in the file given by the
        config "header_file", inside the input file itself (vcf/hdr
        formats), in an external ``<input>.hdr`` file, or inferred from
        the file columns via a Database object. A minimal default VCF
        header is used as a last resort. With no input file, both
        attributes are set to None.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal VCF header used when nothing better can be found
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header provided explicitly through the configuration
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # VCF file format: header is within the input file itself
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): vcf.Writer appears to be used
                            # only for its side effect of writing the
                            # header on construction — confirm
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # Broad except: any failure (I/O, parsing, missing
                        # columns) falls back to the default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # Unsupported input format

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
 421
 422    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
 423        """
 424        > The function `get_query_to_df` takes a query as a string and returns a pandas dataframe
 425
 426        :param query: str = ""
 427        :type query: str
 428        :return: A dataframe
 429        """
 430
 431        # Connexion format
 432        connexion_format = self.get_connexion_format()
 433
 434        # Limit in query
 435        if limit:
 436            pd.set_option("display.max_rows", limit)
 437            if connexion_format in ["duckdb"]:
 438                df = (
 439                    self.conn.execute(query)
 440                    .fetch_record_batch(limit)
 441                    .read_next_batch()
 442                    .to_pandas()
 443                )
 444            elif connexion_format in ["sqlite"]:
 445                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
 446
 447        # Full query
 448        else:
 449            if connexion_format in ["duckdb"]:
 450                df = self.conn.execute(query).df()
 451            elif connexion_format in ["sqlite"]:
 452                df = pd.read_sql_query(query, self.conn)
 453
 454        return df
 455
 456    def get_overview(self) -> None:
 457        """
 458        The function prints the input, output, config, and dataframe of the current object
 459        """
 460        table_variants_from = self.get_table_variants(clause="from")
 461        sql_columns = self.get_header_columns_as_sql()
 462        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 463        df = self.get_query_to_df(sql_query_export)
 464        log.info(
 465            "Input:  "
 466            + str(self.get_input())
 467            + " ["
 468            + str(str(self.get_input_format()))
 469            + "]"
 470        )
 471        log.info(
 472            "Output: "
 473            + str(self.get_output())
 474            + " ["
 475            + str(str(self.get_output_format()))
 476            + "]"
 477        )
 478        log.info("Config: ")
 479        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 480            "\n"
 481        ):
 482            log.info("\t" + str(d))
 483        log.info("Param: ")
 484        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 485            "\n"
 486        ):
 487            log.info("\t" + str(d))
 488        log.info("Sample list: " + str(self.get_header_sample_list()))
 489        log.info("Dataframe: ")
 490        for d in str(df).split("\n"):
 491            log.info("\t" + str(d))
 492
 493        # garbage collector
 494        del df
 495        gc.collect()
 496
 497        return None
 498
    def get_stats(self) -> dict:
        """
        Compute statistics of the loaded variants.

        The returned dictionary contains: general information ("Infos":
        input file, counts of variants/samples/INFO/FORMAT fields),
        variant counts by chromosome and by type ("Variants"),
        per-sample genotype counts ("Samples", when a GT FORMAT field
        and a FORMAT column are present), the INFO/FORMAT header fields
        ("Header") and quality statistics ("Quality", when a QUAL
        column is present).

        :return: a dictionary of statistics
        """

        # Log
        log.info(f"Stats Calculation...")

        # variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chromosome
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage of variants per chromosome (fraction of total)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: genotype stats per sample, only for VCF-like data
        # NOTE(review): the SQL below uses DuckDB functions
        # (REGEXP_EXTRACT, string_split, len) — likely not portable to the
        # sqlite connexion format; confirm
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # Only count samples that actually carry genotypes
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running row index shared across both field tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes to their letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type ("." when missing)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc (empty string when missing)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL statistics (skipping missing values ".")
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel counts by variant type

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution spectrum (e.g. "A>G" counts)
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 720
 721    def stats_to_file(self, file: str = None) -> str:
 722        """
 723        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 724        into a JSON object, and writes the JSON object to the specified file.
 725
 726        :param file: The `file` parameter is a string that represents the file path where the JSON data
 727        will be written
 728        :type file: str
 729        :return: the name of the file that was written to.
 730        """
 731
 732        # Get stats
 733        stats = self.get_stats()
 734
 735        # Serializing json
 736        json_object = json.dumps(stats, indent=4)
 737
 738        # Writing to sample.json
 739        with open(file, "w") as outfile:
 740            outfile.write(json_object)
 741
 742        return file
 743
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report from the statistics and print it.

        The stats are first written as JSON (via ``stats_to_file``), then
        rendered to a markdown file with a title, an index of sections and
        one table per statistics entry; the markdown is also printed to
        stdout.

        :param output_file: path of the markdown output file; defaults to
            "stats.md" in a temporary directory
        :param json_file: path of the JSON stats file; defaults to
            "stats.json" in a temporary directory
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default file locations inside the temporary directory
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders if needed
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Load stats back (YAML loader also reads JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index of sections, body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                # Markdown anchor for the section heading
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Probe whether the entry is table-like: first as a
                        # dict-of-dicts, then as a JSON string; otherwise
                        # render it as a plain "key: value" bullet
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file: title, then index, then body
            # NOTE(review): the local `fp` shadows the module-level
            # fastparquet alias within this block
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown to stdout (index omitted)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
 845
 846    def get_input(self) -> str:
 847        """
 848        It returns the value of the input variable.
 849        :return: The input is being returned.
 850        """
 851        return self.input
 852
 853    def get_input_format(self, input_file: str = None) -> str:
 854        """
 855        It returns the format of the input variable.
 856        :return: The format is being returned.
 857        """
 858        if not input_file:
 859            input_file = self.get_input()
 860        input_format = get_file_format(input_file)
 861        return input_format
 862
 863    def get_input_compressed(self, input_file: str = None) -> str:
 864        """
 865        It returns the format of the input variable.
 866        :return: The format is being returned.
 867        """
 868        if not input_file:
 869            input_file = self.get_input()
 870        input_compressed = get_file_compressed(input_file)
 871        return input_compressed
 872
 873    def get_output(self) -> str:
 874        """
 875        It returns the output of the neuron.
 876        :return: The output of the neural network.
 877        """
 878        return self.output
 879
 880    def get_output_format(self, output_file: str = None) -> str:
 881        """
 882        It returns the format of the input variable.
 883        :return: The format is being returned.
 884        """
 885        if not output_file:
 886            output_file = self.get_output()
 887        output_format = get_file_format(output_file)
 888
 889        return output_format
 890
 891    def get_config(self) -> dict:
 892        """
 893        It returns the config
 894        :return: The config variable is being returned.
 895        """
 896        return self.config
 897
 898    def get_param(self) -> dict:
 899        """
 900        It returns the param
 901        :return: The param variable is being returned.
 902        """
 903        return self.param
 904
 905    def get_connexion_db(self) -> str:
 906        """
 907        It returns the connexion_db attribute of the object
 908        :return: The connexion_db is being returned.
 909        """
 910        return self.connexion_db
 911
 912    def get_prefix(self) -> str:
 913        """
 914        It returns the prefix of the object.
 915        :return: The prefix is being returned.
 916        """
 917        return self.prefix
 918
 919    def get_table_variants(self, clause: str = "select") -> str:
 920        """
 921        This function returns the table_variants attribute of the object
 922
 923        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 924        defaults to select (optional)
 925        :return: The table_variants attribute of the object.
 926        """
 927
 928        # Access
 929        access = self.get_config().get("access", None)
 930
 931        # Clauses "select", "where", "update"
 932        if clause in ["select", "where", "update"]:
 933            table_variants = self.table_variants
 934        # Clause "from"
 935        elif clause in ["from"]:
 936            # For Read Only
 937            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 938                input_file = self.get_input()
 939                table_variants = f"'{input_file}' as variants"
 940            # For Read Write
 941            else:
 942                table_variants = f"{self.table_variants} as variants"
 943        else:
 944            table_variants = self.table_variants
 945        return table_variants
 946
 947    def get_tmp_dir(self) -> str:
 948        """
 949        The function `get_tmp_dir` returns the temporary directory path based on configuration
 950        parameters or a default path.
 951        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 952        configuration, parameters, and a default value of "/tmp".
 953        """
 954
 955        return get_tmp(
 956            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 957        )
 958
 959    def get_connexion_type(self) -> str:
 960        """
 961        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 962
 963        :return: The connexion type is being returned.
 964        """
 965        return self.get_config().get("connexion_type", "memory")
 966
 967    def get_connexion(self):
 968        """
 969        It returns the connection object
 970
 971        :return: The connection object.
 972        """
 973        return self.conn
 974
 975    def close_connexion(self) -> None:
 976        """
 977        This function closes the connection to the database.
 978        :return: The connection is being closed.
 979        """
 980        return self.conn.close()
 981
 982    def get_header(self, type: str = "vcf"):
 983        """
 984        This function returns the header of the VCF file as a list of strings
 985
 986        :param type: the type of header you want to get, defaults to vcf (optional)
 987        :return: The header of the vcf file.
 988        """
 989
 990        if self.header_vcf:
 991            if type == "vcf":
 992                return self.header_vcf
 993            elif type == "list":
 994                return self.header_list
 995        else:
 996            if type == "vcf":
 997                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 998                return header
 999            elif type == "list":
1000                return vcf_required
1001
1002    def get_header_length(self, file: str = None) -> int:
1003        """
1004        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1005        line.
1006
1007        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1008        header file. If this argument is provided, the function will read the header from the specified
1009        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1010        :type file: str
1011        :return: the length of the header list, excluding the #CHROM line.
1012        """
1013
1014        if file:
1015            return len(self.read_vcf_header_file(file=file)) - 1
1016        elif self.get_header(type="list"):
1017            return len(self.get_header(type="list")) - 1
1018        else:
1019            return 0
1020
1021    def get_header_columns(self) -> str:
1022        """
1023        This function returns the header list of a VCF
1024
1025        :return: The length of the header list.
1026        """
1027        if self.get_header():
1028            return self.get_header(type="list")[-1]
1029        else:
1030            return ""
1031
1032    def get_header_columns_as_list(self) -> list:
1033        """
1034        This function returns the header list of a VCF
1035
1036        :return: The length of the header list.
1037        """
1038        if self.get_header():
1039            return self.get_header_columns().strip().split("\t")
1040        else:
1041            return []
1042
1043    def get_header_columns_as_sql(self) -> str:
1044        """
1045        This function retruns header length (without #CHROM line)
1046
1047        :return: The length of the header list.
1048        """
1049        sql_column_list = []
1050        for col in self.get_header_columns_as_list():
1051            sql_column_list.append(f'"{col}"')
1052        return ",".join(sql_column_list)
1053
1054    def get_header_sample_list(self) -> list:
1055        """
1056        This function retruns header length (without #CHROM line)
1057
1058        :return: The length of the header list.
1059        """
1060        return self.header_vcf.samples
1061
1062    def get_verbose(self) -> bool:
1063        """
1064        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1065        exist
1066
1067        :return: The value of the key "verbose" in the config dictionary.
1068        """
1069        return self.get_config().get("verbose", False)
1070
1071    def get_connexion_format(self) -> str:
1072        """
1073        It returns the connexion format of the object.
1074        :return: The connexion_format is being returned.
1075        """
1076        connexion_format = self.connexion_format
1077        if connexion_format not in ["duckdb", "sqlite"]:
1078            log.error(f"Unknown connexion format {connexion_format}")
1079            raise ValueError(f"Unknown connexion format {connexion_format}")
1080        else:
1081            return connexion_format
1082
1083    def insert_file_to_table(
1084        self,
1085        file,
1086        columns: str,
1087        header_len: int = 0,
1088        sep: str = "\t",
1089        chunksize: int = 1000000,
1090    ) -> None:
1091        """
1092        The function reads a file in chunks, and inserts each chunk into a table
1093
1094        :param file: the file to be loaded
1095        :param columns: a string of the column names separated by commas
1096        :param header_len: the number of lines to skip at the beginning of the file, defaults to 0
1097        (optional)
1098        :param sep: the separator used in the file, defaults to \t (optional)
1099        :param chunksize: The number of rows to read in at a time, defaults to 1000000 (optional)
1100        """
1101
1102        # Config
1103        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1104        connexion_format = self.get_connexion_format()
1105
1106        log.debug("chunksize: " + str(chunksize))
1107
1108        if chunksize:
1109            for chunk in pd.read_csv(
1110                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1111            ):
1112                if connexion_format in ["duckdb"]:
1113                    sql_insert_into = (
1114                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1115                    )
1116                    self.conn.execute(sql_insert_into)
1117                elif connexion_format in ["sqlite"]:
1118                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1119
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """Load the input file into the variants table.

        DuckDB connexions delegate to the Database helper (any format it
        supports, or an already-existing duckdb database); SQLite connexions
        load vcf/tsv/csv/psv files by creating the table explicitly and
        bulk-inserting with pandas. Afterwards, INFO fields are optionally
        exploded into columns and indexes are created.

        :param input_file: optional path; when given, replaces the current
            input and re-reads the header before loading
        :type input_file: str
        :param drop_variants_table: when True, drop the variants table
            before loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: number of rows sampled by Database.get_sql_from
            to infer the schema; falsy values are converted to -1
            (presumably "no limit" — TODO confirm in Database), defaults to
            20480
        :type sample_size: int (optional)
        :raises ValueError: when the input format cannot be loaded with the
            current connexion format
        """

        log.info("Loading...")

        # Optionally switch to a new input file (header must be re-read)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Optionally drop the existing variants table first
        if drop_variants_table:
            self.drop_variants_table()

        # Target table name
        table_variants = self.get_table_variants()

        # Access mode ("RO" selects a view instead of a table below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression label — only used for the debug log below
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite"; raises otherwise)
        connexion_format = self.get_connexion_format()

        # Falsy sample size is normalized to -1
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Input is already a duckdb database
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): connexion_format is already known to be
                # "duckdb" here, so the else branch below is unreachable
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from another format via the Database helper
            else:

                try:
                    # Create Table or View from the Database-provided source
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # RO access creates a view; otherwise materialize a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except swallows the real error (including
                # KeyboardInterrupt) and reports it as an unavailable format
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion: only delimited text formats are supported
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this binds the SAME dict, not a copy — writing to
            # either name mutates both; works here but is fragile
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Column definitions (CREATE) and quoted names (INSERT)
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if needed
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # Number of rows loaded per chunk
            chunksize = 100000

            # Field delimiter for this format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # NOTE(review): for compressed input this rebinds input_file
                # to a bgzf handle — the plain handle from the with-statement
                # stays open until exit and the bgzf handle is never closed
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF carries header lines that must be skipped
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Bulk-insert the file contents into the variants table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        # Unsupported connexion/format combination
        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Optionally explode INFO fields into dedicated table columns
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()
1315
1316    def get_explode_infos(self) -> bool:
1317        """
1318        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1319        to False if it is not set.
1320        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1321        value. If the parameter is not present, it will return False.
1322        """
1323
1324        return self.get_param().get("explode", {}).get("explode_infos", False)
1325
1326    def get_explode_infos_fields(
1327        self,
1328        explode_infos_fields: str = None,
1329        remove_fields_not_in_header: bool = False,
1330    ) -> list:
1331        """
1332        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1333        the input parameter `explode_infos_fields`.
1334
1335        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1336        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1337        comma-separated list of field names to explode
1338        :type explode_infos_fields: str
1339        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1340        flag that determines whether to remove fields that are not present in the header. If it is set
1341        to `True`, any field that is not in the header will be excluded from the list of exploded
1342        information fields. If it is set to `, defaults to False
1343        :type remove_fields_not_in_header: bool (optional)
1344        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1345        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1346        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1347        Otherwise, it returns a list of exploded information fields after removing any spaces and
1348        splitting the string by commas.
1349        """
1350
1351        # If no fields, get it in param
1352        if not explode_infos_fields:
1353            explode_infos_fields = (
1354                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1355            )
1356
1357        # If no fields, defined as all fields in header using keyword
1358        if not explode_infos_fields:
1359            explode_infos_fields = "*"
1360
1361        # If fields list not empty
1362        if explode_infos_fields:
1363
1364            # Input fields list
1365            if isinstance(explode_infos_fields, str):
1366                fields_input = explode_infos_fields.split(",")
1367            elif isinstance(explode_infos_fields, list):
1368                fields_input = explode_infos_fields
1369            else:
1370                fields_input = []
1371
1372            # Fields list without * keyword
1373            fields_without_all = fields_input.copy()
1374            if "*".casefold() in (item.casefold() for item in fields_without_all):
1375                fields_without_all.remove("*")
1376
1377            # Fields in header
1378            fields_in_header = sorted(list(set(self.get_header().infos)))
1379
1380            # Construct list of fields
1381            fields_output = []
1382            for field in fields_input:
1383
1384                # Strip field
1385                field = field.strip()
1386
1387                # format keyword * in regex
1388                if field.upper() in ["*"]:
1389                    field = ".*"
1390
1391                # Find all fields with pattern
1392                r = re.compile(field)
1393                fields_search = sorted(list(filter(r.match, fields_in_header)))
1394
1395                # Remove fields input from search
1396                if fields_search != [field]:
1397                    fields_search = sorted(
1398                        list(set(fields_search).difference(fields_input))
1399                    )
1400
1401                # If field is not in header (avoid not well formatted header)
1402                if not fields_search and not remove_fields_not_in_header:
1403                    fields_search = [field]
1404
1405                # Add found fields
1406                for new_field in fields_search:
1407                    # Add field, if not already exists, and if it is in header (if asked)
1408                    if (
1409                        new_field not in fields_output
1410                        and (
1411                            not remove_fields_not_in_header
1412                            or new_field in fields_in_header
1413                        )
1414                        and new_field not in [".*"]
1415                    ):
1416                        fields_output.append(new_field)
1417
1418            return fields_output
1419
1420        else:
1421
1422            return []
1423
1424    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1425        """
1426        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1427        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1428        not provided.
1429
1430        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1431        prefix to be used for exploding or expanding information
1432        :type explode_infos_prefix: str
1433        :return: the value of the variable `explode_infos_prefix`.
1434        """
1435
1436        if not explode_infos_prefix:
1437            explode_infos_prefix = (
1438                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1439            )
1440
1441        return explode_infos_prefix
1442
1443    def add_column(
1444        self,
1445        table_name,
1446        column_name,
1447        column_type,
1448        default_value=None,
1449        drop: bool = False,
1450    ) -> dict:
1451        """
1452        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1453        doesn't already exist.
1454
1455        :param table_name: The name of the table to which you want to add a column
1456        :param column_name: The parameter "column_name" is the name of the column that you want to add
1457        to the table
1458        :param column_type: The `column_type` parameter specifies the data type of the column that you
1459        want to add to the table. It should be a string that represents the desired data type, such as
1460        "INTEGER", "TEXT", "REAL", etc
1461        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1462        default value for the newly added column. If a default value is provided, it will be assigned to
1463        the column for any existing rows that do not have a value for that column
1464        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1465        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1466        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1467        to False
1468        :type drop: bool (optional)
1469        :return: a boolean value indicating whether the column was successfully added to the table.
1470        """
1471
1472        # added
1473        added = False
1474        dropped = False
1475
1476        # Check if the column already exists in the table
1477        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1478        columns = self.get_query_to_df(query).columns.tolist()
1479        if column_name in columns:
1480            log.debug(
1481                f"The {column_name} column already exists in the {table_name} table"
1482            )
1483            if drop:
1484                self.drop_column(table_name=table_name, column_name=column_name)
1485                dropped = True
1486            else:
1487                return None
1488        else:
1489            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1490
1491        # Add column in table
1492        add_column_query = (
1493            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1494        )
1495        if default_value is not None:
1496            add_column_query += f" DEFAULT {default_value}"
1497        self.execute_query(add_column_query)
1498        added = not dropped
1499        log.debug(
1500            f"The {column_name} column was successfully added to the {table_name} table"
1501        )
1502
1503        if added:
1504            added_column = {
1505                "table_name": table_name,
1506                "column_name": column_name,
1507                "column_type": column_type,
1508                "default_value": default_value,
1509            }
1510        else:
1511            added_column = None
1512
1513        return added_column
1514
1515    def drop_column(
1516        self, column: dict = None, table_name: str = None, column_name: str = None
1517    ) -> bool:
1518        """
1519        The `drop_column` function drops a specified column from a given table in a database and returns
1520        True if the column was successfully dropped, and False if the column does not exist in the
1521        table.
1522
1523        :param column: The `column` parameter is a dictionary that contains information about the column
1524        you want to drop. It has two keys:
1525        :type column: dict
1526        :param table_name: The `table_name` parameter is the name of the table from which you want to
1527        drop a column
1528        :type table_name: str
1529        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1530        from the table
1531        :type column_name: str
1532        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1533        and False if the column does not exist in the table.
1534        """
1535
1536        # Find column infos
1537        if column:
1538            if isinstance(column, dict):
1539                table_name = column.get("table_name", None)
1540                column_name = column.get("column_name", None)
1541            elif isinstance(column, str):
1542                table_name = self.get_table_variants()
1543                column_name = column
1544            else:
1545                table_name = None
1546                column_name = None
1547
1548        if not table_name and not column_name:
1549            return False
1550
1551        # Removed
1552        removed = False
1553
1554        # Check if the column already exists in the table
1555        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1556        columns = self.get_query_to_df(query).columns.tolist()
1557        if column_name in columns:
1558            log.debug(f"The {column_name} column exists in the {table_name} table")
1559        else:
1560            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1561            return False
1562
1563        # Add column in table # ALTER TABLE integers DROP k
1564        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1565        self.execute_query(add_column_query)
1566        removed = True
1567        log.debug(
1568            f"The {column_name} column was successfully dropped to the {table_name} table"
1569        )
1570
1571        return removed
1572
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
        columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
        individual columns. If this parameter is not provided, all INFO fields will be exploded
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate the column if it already exists in the table. If `force` is set to `True`, the column
        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns.
        """

        # drop indexes
        # Indexes are dropped first because ALTER TABLE / mass UPDATE on an
        # indexed table would be slow (and duckdb restricts some ALTERs)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access
        # "RO" (read-only) access disables the whole explode operation below
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix
            # Fall back to the instance-level prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently maps any failure to "no
            # extra infos" — deliberate best-effort, kept as-is
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check
            # Union of header-declared INFO ids and explicitly requested fields
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column (e.g. "INFO/DP")
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields unknown to the header default to a single String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (Number != 1) are kept as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array
                        # Build the SET fragment extracting "info=value" from
                        # the raw INFO column, per engine dialect.
                        # NOTE(review): for any other connexion format,
                        # update_info_field is unbound here — the append below
                        # would raise NameError (or reuse the previous loop
                        # iteration's value) — confirm only duckdb/sqlite occur
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite has no regex: emulate the extraction with
                            # nested instr/substr (value runs from after
                            # "info=" to the next ';' or end of string)
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes
                # Chunk the UPDATE per chromosome to bound transaction size;
                # on failure fall back to a single full-table pass
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    # Either one UPDATE with all SET fragments, or one UPDATE
                    # per exploded field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
1778
1779    def create_indexes(self) -> None:
1780        """
1781        Create indexes on the table after insertion
1782        """
1783
1784        # Access
1785        access = self.get_config().get("access", None)
1786
1787        # get table variants
1788        table_variants = self.get_table_variants("FROM")
1789
1790        if self.get_indexing() and access not in ["RO"]:
1791            # Create index
1792            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1793            self.conn.execute(sql_create_table_index)
1794            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1795            self.conn.execute(sql_create_table_index)
1796            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1797            self.conn.execute(sql_create_table_index)
1798            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1799            self.conn.execute(sql_create_table_index)
1800            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1801            self.conn.execute(sql_create_table_index)
1802            for field in self.index_additionnal_fields:
1803                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1804                self.conn.execute(sql_create_table_index)
1805
1806    def drop_indexes(self) -> None:
1807        """
1808        Create indexes on the table after insertion
1809        """
1810
1811        # Access
1812        access = self.get_config().get("access", None)
1813
1814        # get table variants
1815        table_variants = self.get_table_variants("FROM")
1816
1817        # Get database format
1818        connexion_format = self.get_connexion_format()
1819
1820        if access not in ["RO"]:
1821            if connexion_format in ["duckdb"]:
1822                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1823            elif connexion_format in ["sqlite"]:
1824                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1825
1826            list_indexes = self.conn.execute(sql_list_indexes)
1827            index_names = [row[0] for row in list_indexes.fetchall()]
1828            for index in index_names:
1829                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1830                self.conn.execute(sql_drop_table_index)
1831
1832    def read_vcf_header(self, f) -> list:
1833        """
1834        It reads the header of a VCF file and returns a list of the header lines
1835
1836        :param f: the file object
1837        :return: The header lines of the VCF file.
1838        """
1839
1840        header_list = []
1841        for line in f:
1842            header_list.append(line)
1843            if line.startswith("#CHROM"):
1844                break
1845        return header_list
1846
1847    def read_vcf_header_file(self, file: str = None) -> list:
1848        """
1849        The function `read_vcf_header_file` reads the header of a VCF file, either from a compressed or
1850        uncompressed file.
1851
1852        :param file: The `file` parameter is a string that represents the path to the VCF header file
1853        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1854        default to `None`
1855        :type file: str
1856        :param compressed: The `compressed` parameter is a boolean flag that indicates whether the VCF
1857        file is compressed or not. If `compressed` is set to `True`, it means that the VCF file is
1858        compressed using the BGZF compression format. If `compressed` is set to `False`, it means that,
1859        defaults to False
1860        :type compressed: bool (optional)
1861        :return: a list.
1862        """
1863
1864        if self.get_input_compressed(input_file=file):
1865            with bgzf.open(file, "rt") as f:
1866                return self.read_vcf_header(f=f)
1867        else:
1868            with open(file, "rt") as f:
1869                return self.read_vcf_header(f=f)
1870
1871    def execute_query(self, query: str):
1872        """
1873        It takes a query as an argument, executes it, and returns the results
1874
1875        :param query: The query to be executed
1876        :return: The result of the query is being returned.
1877        """
1878        if query:
1879            return self.conn.execute(query)  # .fetchall()
1880        else:
1881            return None
1882
1883    def export_output(
1884        self,
1885        output_file: str | None = None,
1886        output_header: str | None = None,
1887        export_header: bool = True,
1888        query: str | None = None,
1889        parquet_partitions: list | None = None,
1890        chunk_size: int | None = None,
1891        threads: int | None = None,
1892        sort: bool = False,
1893        index: bool = False,
1894        order_by: str | None = None,
1895    ) -> bool:
1896        """
1897        The `export_output` function exports data from a VCF file to a specified output file in various
1898        formats, including VCF, CSV, TSV, PSV, and Parquet.
1899
1900        :param output_file: The `output_file` parameter is a string that specifies the name of the
1901        output file to be generated by the function. This is where the exported data will be saved
1902        :type output_file: str
1903        :param output_header: The `output_header` parameter is a string that specifies the name of the
1904        file where the header of the VCF file will be exported. If this parameter is not provided, the
1905        header will be exported to a file with the same name as the `output_file` parameter, but with
1906        the extension "
1907        :type output_header: str
1908        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1909        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1910        True, the header will be exported to a file. If `export_header` is False, the header will not
1911        be, defaults to True, if output format is not VCF
1912        :type export_header: bool (optional)
1913        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1914        select specific data from the VCF file before exporting it. If provided, only the data that
1915        matches the query will be exported
1916        :type query: str
1917        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1918        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1919        organize data in a hierarchical directory structure based on the values of one or more columns.
1920        This can improve query performance when working with large datasets
1921        :type parquet_partitions: list
1922        :param chunk_size: The `chunk_size` parameter specifies the number of
1923        records in batch when exporting data in Parquet format. This parameter is used for
1924        partitioning the Parquet file into multiple files.
1925        :type chunk_size: int
1926        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1927        threads to be used during the export process. It determines the level of parallelism and can
1928        improve the performance of the export operation. If not provided, the function will use the
1929        default number of threads
1930        :type threads: int
1931        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1932        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1933        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1934        False
1935        :type sort: bool (optional)
1936        :param index: The `index` parameter is a boolean flag that determines whether an index should be
1937        created on the output file. If `index` is True, an index will be created. If `index` is False,
1938        no index will be created. The default value is False, defaults to False
1939        :type index: bool (optional)
1940        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
1941        sorting the output file. This parameter is only applicable when exporting data in VCF format
1942        :type order_by: str
1943        :return: a boolean value. It checks if the output file exists and returns True if it does, or
1944        None if it doesn't.
1945        """
1946
1947        # Log
1948        log.info("Exporting...")
1949
1950        # Full path
1951        output_file = full_path(output_file)
1952        output_header = full_path(output_header)
1953
1954        # Config
1955        config = self.get_config()
1956
1957        # Param
1958        param = self.get_param()
1959
1960        # Tmp files to remove
1961        tmp_to_remove = []
1962
1963        # If no output, get it
1964        if not output_file:
1965            output_file = self.get_output()
1966
1967        # If not threads
1968        if not threads:
1969            threads = self.get_threads()
1970
1971        # Auto header name with extension
1972        if export_header or output_header:
1973            if not output_header:
1974                output_header = f"{output_file}.hdr"
1975            # Export header
1976            self.export_header(output_file=output_file)
1977
1978        # Switch off export header if VCF output
1979        output_file_type = get_file_format(output_file)
1980        if output_file_type in ["vcf"]:
1981            export_header = False
1982            tmp_to_remove.append(output_header)
1983
1984        # Chunk size
1985        if not chunk_size:
1986            chunk_size = config.get("chunk_size", None)
1987
1988        # Parquet partition
1989        if not parquet_partitions:
1990            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
1991        if parquet_partitions and isinstance(parquet_partitions, str):
1992            parquet_partitions = parquet_partitions.split(",")
1993
1994        # Order by
1995        if not order_by:
1996            order_by = param.get("export", {}).get("order_by", "")
1997
1998        # Header in output
1999        header_in_output = param.get("export", {}).get("include_header", False)
2000
2001        # Database
2002        database_source = self.get_connexion()
2003
2004        # Connexion format
2005        connexion_format = self.get_connexion_format()
2006
2007        # Explode infos
2008        if self.get_explode_infos():
2009            self.explode_infos(
2010                prefix=self.get_explode_infos_prefix(),
2011                fields=self.get_explode_infos_fields(),
2012                force=False,
2013            )
2014
2015        # if connexion_format in ["sqlite"] or query:
2016        if connexion_format in ["sqlite"]:
2017
2018            # Export in Parquet
2019            random_tmp = "".join(
2020                random.choice(string.ascii_lowercase) for i in range(10)
2021            )
2022            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2023            tmp_to_remove.append(database_source)
2024
2025            # Table Variants
2026            table_variants = self.get_table_variants()
2027
2028            # Create export query
2029            sql_query_export_subquery = f"""
2030                SELECT * FROM {table_variants}
2031                """
2032
2033            # Write source file
2034            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2035
2036        # Create database
2037        database = Database(
2038            database=database_source,
2039            table="variants",
2040            header_file=output_header,
2041            conn_config=self.get_connexion_config(),
2042        )
2043
2044        # Existing colomns header
2045        # existing_columns_header = database.get_header_file_columns(output_header)
2046        existing_columns_header = database.get_header_columns_from_database()
2047
2048        # Export file
2049        database.export(
2050            output_database=output_file,
2051            output_header=output_header,
2052            existing_columns_header=existing_columns_header,
2053            parquet_partitions=parquet_partitions,
2054            chunk_size=chunk_size,
2055            threads=threads,
2056            sort=sort,
2057            index=index,
2058            header_in_output=header_in_output,
2059            order_by=order_by,
2060            query=query,
2061            export_header=export_header,
2062        )
2063
2064        # Remove
2065        remove_if_exists(tmp_to_remove)
2066
2067        return (os.path.exists(output_file) or None) and (
2068            os.path.exists(output_file) or None
2069        )
2070
2071    def get_extra_infos(self, table: str = None) -> list:
2072        """
2073        > This function returns a list of columns that are in the table but not in the header
2074
2075        The function is called `get_extra_infos` and it takes two arguments: `self` and `table`. The
2076        `self` argument is a reference to the object that called the function. The `table` argument is
2077        the name of the table that we want to get the extra columns from
2078
2079        :param table: The table to get the extra columns from. If not specified, it will use the
2080        variants table
2081        :param format: The format of the output. If it's "sql", it will return a string of the extra
2082        columns separated by commas. If it's "list", it will return a list of the extra columns
2083        :return: A list of columns that are in the table but not in the header
2084        """
2085
2086        header_columns = []
2087
2088        if not table:
2089            table = self.get_table_variants(clause="from")
2090            header_columns = self.get_header_columns()
2091
2092        # Check all columns in the database
2093        query = f""" SELECT * FROM {table} LIMIT 1 """
2094        log.debug(f"query {query}")
2095        table_columns = self.get_query_to_df(query).columns.tolist()
2096        extra_columns = []
2097
2098        # Construct extra infos (not in header)
2099        for column in table_columns:
2100            if column not in header_columns:
2101                extra_columns.append(column)
2102
2103        return extra_columns
2104
2105    def get_extra_infos_sql(self, table: str = None) -> str:
2106        """
2107        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2108        by double quotes
2109
2110        :param table: The name of the table to get the extra infos from. If None, the default table is
2111        used
2112        :type table: str
2113        :return: A string of the extra infos
2114        """
2115
2116        return ", ".join(
2117            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2118        )
2119
    def export_header(
        self,
        header_name: str = None,
        output_file: str = None,
        output_file_ext: str = ".hdr",
        clean_header: bool = True,
        remove_chrom_line: bool = False,
    ) -> str:
        """
        The `export_header` function takes a VCF file, extracts the header, modifies it according to
        specified options, and writes it to a new file.

        :param header_name: The `header_name` parameter is the name of the header file to be created. If
        this parameter is not specified, the header will be written to the output file
        :type header_name: str
        :param output_file: The `output_file` parameter in the `export_header` function is used to
        specify the name of the output file where the header will be written. If this parameter is not
        provided, the header will be written to a temporary file
        :type output_file: str
        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
        string that represents the extension of the output header file. By default, it is set to ".hdr"
        if not specified by the user. This extension will be appended to the `output_file` name to
        create the final, defaults to .hdr
        :type output_file_ext: str (optional)
        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
        `True`, the function will clean the header by modifying certain lines based on a specific
        pattern. If `clean_header`, defaults to True
        :type clean_header: bool (optional)
        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
        boolean flag that determines whether the #CHROM line should be removed from the header before
        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
        defaults to False
        :type remove_chrom_line: bool (optional)
        :return: The function `export_header` returns the name of the temporary header file that is
        created.
        """

        # NOTE(review): header_name is only used in this fallback test and
        # nowhere else in the body — confirm whether it is still needed
        if not header_name and not output_file:
            output_file = self.get_output()

        # NOTE(review): when self.get_header() is falsy, tmp_header_name is
        # never bound and the final return raises NameError — confirm callers
        # always have a header
        if self.get_header():

            # Get header object
            header_obj = self.get_header()

            # Create database
            # Used only to read the real column layout of the input file
            db_for_header = Database(database=self.get_input())

            # Get real columns in the file
            db_header_columns = db_for_header.get_columns()

            with tempfile.TemporaryDirectory() as tmpdir:

                # Write header file
                # Serialize the header via vcf.Writer into a scratch file
                header_file_tmp = os.path.join(tmpdir, "header")
                f = open(header_file_tmp, "w")
                vcf.Writer(f, header_obj)
                f.close()

                # Replace #CHROM line with rel columns
                # The last header line is the #CHROM line; rewrite it with the
                # columns actually present in the input file
                header_list = db_for_header.read_header_file(
                    header_file=header_file_tmp
                )
                header_list[-1] = "\t".join(db_header_columns)

                # Remove CHROM line
                if remove_chrom_line:
                    header_list.pop()

                # Clean header
                # Rewrite malformed FORMAT lines declared as Type=Flag (Flag
                # is not a valid FORMAT type) into Type=String
                if clean_header:
                    header_list_clean = []
                    for head in header_list:
                        # Clean head for malformed header
                        head_clean = head
                        head_clean = re.subn(
                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
                            head_clean,
                            2,
                        )[0]
                        # Write header
                        header_list_clean.append(head_clean)
                    header_list = header_list_clean

            # Final header path: output file name plus extension (".hdr")
            tmp_header_name = output_file + output_file_ext

            f = open(tmp_header_name, "w")
            for line in header_list:
                f.write(line)
            f.close()

        return tmp_header_name
2214
2215    def export_variant_vcf(
2216        self,
2217        vcf_file,
2218        remove_info: bool = False,
2219        add_samples: bool = True,
2220        list_samples: list = [],
2221        index: bool = False,
2222        threads: int | None = None,
2223    ) -> bool | None:
2224        """
2225        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2226        remove INFO field, add samples, and control compression and indexing.
2227
2228        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2229        written to. It is the output file that will contain the filtered VCF data based on the specified
2230        parameters
2231        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2232        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2233        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2234        in, defaults to False
2235        :type remove_info: bool (optional)
2236        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2237        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2238        If set to False, the samples will be removed. The default value is True, defaults to True
2239        :type add_samples: bool (optional)
2240        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2241        in the output VCF file. By default, all samples will be included. If you provide a list of
2242        samples, only those samples will be included in the output file
2243        :type list_samples: list
2244        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2245        determines whether or not to create an index for the output VCF file. If `index` is set to
2246        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2247        :type index: bool (optional)
2248        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2249        number of threads to use for exporting the VCF file. It determines how many parallel threads
2250        will be used during the export process. More threads can potentially speed up the export process
2251        by utilizing multiple cores of the processor. If
2252        :type threads: int | None
2253        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2254        method with various parameters including the output file, query, threads, sort flag, and index
2255        flag. The `export_output` method is responsible for exporting the VCF data based on the
2256        specified parameters and configurations provided in the `export_variant_vcf` function.
2257        """
2258
2259        # Config
2260        config = self.get_config()
2261
2262        # Extract VCF
2263        log.debug("Export VCF...")
2264
2265        # Table variants
2266        table_variants = self.get_table_variants()
2267
2268        # Threads
2269        if not threads:
2270            threads = self.get_threads()
2271
2272        # Info fields
2273        if remove_info:
2274            if not isinstance(remove_info, str):
2275                remove_info = "."
2276            info_field = f"""'{remove_info}' as INFO"""
2277        else:
2278            info_field = "INFO"
2279
2280        # Samples fields
2281        if add_samples:
2282            if not list_samples:
2283                list_samples = self.get_header_sample_list()
2284            if list_samples:
2285                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2286            else:
2287                samples_fields = ""
2288            log.debug(f"samples_fields: {samples_fields}")
2289        else:
2290            samples_fields = ""
2291
2292        # Variants
2293        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2294        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """
2295
2296        return self.export_output(
2297            output_file=vcf_file,
2298            output_header=None,
2299            export_header=True,
2300            query=sql_query_select,
2301            parquet_partitions=None,
2302            chunk_size=config.get("chunk_size", None),
2303            threads=threads,
2304            sort=True,
2305            index=index,
2306            order_by=None,
2307        )
2308
2309    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2310        """
2311        It takes a list of commands and runs them in parallel using the number of threads specified
2312
2313        :param commands: A list of commands to run
2314        :param threads: The number of threads to use, defaults to 1 (optional)
2315        """
2316
2317        run_parallel_commands(commands, threads)
2318
2319    def get_threads(self, default: int = 1) -> int:
2320        """
2321        This function returns the number of threads to use for a job, with a default value of 1 if not
2322        specified.
2323
2324        :param default: The `default` parameter in the `get_threads` method is used to specify the
2325        default number of threads to use if no specific value is provided. If no value is provided for
2326        the `threads` parameter in the configuration or input parameters, the `default` value will be
2327        used, defaults to 1
2328        :type default: int (optional)
2329        :return: the number of threads to use for the current job.
2330        """
2331
2332        # Config
2333        config = self.get_config()
2334
2335        # Param
2336        param = self.get_param()
2337
2338        # Input threads
2339        input_thread = param.get("threads", config.get("threads", None))
2340
2341        # Check threads
2342        if not input_thread:
2343            threads = default
2344        elif int(input_thread) <= 0:
2345            threads = os.cpu_count()
2346        else:
2347            threads = int(input_thread)
2348        return threads
2349
2350    def get_memory(self, default: str = None) -> str:
2351        """
2352        This function retrieves the memory value from parameters or configuration with a default value
2353        if not found.
2354
2355        :param default: The `get_memory` function takes in a default value as a string parameter. This
2356        default value is used as a fallback in case the `memory` parameter is not provided in the
2357        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2358        the function
2359        :type default: str
2360        :return: The `get_memory` function returns a string value representing the memory parameter. If
2361        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2362        return the default value provided as an argument to the function.
2363        """
2364
2365        # Config
2366        config = self.get_config()
2367
2368        # Param
2369        param = self.get_param()
2370
2371        # Input threads
2372        input_memory = param.get("memory", config.get("memory", None))
2373
2374        # Check threads
2375        if input_memory:
2376            memory = input_memory
2377        else:
2378            memory = default
2379
2380        return memory
2381
2382    def update_from_vcf(self, vcf_file: str) -> None:
2383        """
2384        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2385
2386        :param vcf_file: the path to the VCF file
2387        """
2388
2389        connexion_format = self.get_connexion_format()
2390
2391        if connexion_format in ["duckdb"]:
2392            self.update_from_vcf_duckdb(vcf_file)
2393        elif connexion_format in ["sqlite"]:
2394            self.update_from_vcf_sqlite(vcf_file)
2395
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB backend).

        The VCF body is loaded into a pandas DataFrame that the update query reads
        directly: DuckDB resolves the table name `vcf_df` in the SQL to the local
        DataFrame of the same name (replacement scan). For each variant matching on
        #CHROM/POS/REF/ALT, the non-empty VCF INFO value is appended to the existing
        INFO value, separated by ';' when both sides are non-empty (empty meaning
        '' or '.').

        :param vcf_file: the path to the VCF file whose INFO fields are merged in
        :type vcf_file: str
        """

        # variants table name
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame: skip the meta-header lines so the
        # '#CHROM ...' column line becomes the header row (assumes
        # get_header_length counts the '##' meta lines — TODO confirm)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: `vcf_df` is not unused — the SQL below references it by name
        # ("FROM vcf_df as table_parquet") and DuckDB's replacement scan
        # resolves that identifier to this local pandas DataFrame.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2451
2452    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2453        """
2454        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2455        table, then updates the INFO column of the variants table with the INFO column of the temporary
2456        table
2457
2458        :param vcf_file: The path to the VCF file you want to update the database with
2459        """
2460
2461        # Create a temporary table for the VCF
2462        table_vcf = "tmp_vcf"
2463        sql_create = (
2464            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2465        )
2466        self.conn.execute(sql_create)
2467
2468        # Loading VCF into temporaire table
2469        vcf_df = pd.read_csv(
2470            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2471        )
2472        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2473        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2474
2475        # Update table 'variants' with VCF data
2476        # warning: CONCAT as || operator
2477        sql_query_update = f"""
2478            UPDATE variants as table_variants
2479            SET INFO = CASE
2480                            WHEN INFO NOT IN ('', '.')
2481                            THEN INFO
2482                            ELSE ''
2483                        END ||
2484                        (
2485                        SELECT 
2486                            CASE 
2487                                WHEN table_variants.INFO NOT IN ('','.') 
2488                                    AND table_vcf.INFO NOT IN ('','.')  
2489                                THEN ';' 
2490                                ELSE '' 
2491                            END || 
2492                            CASE 
2493                                WHEN table_vcf.INFO NOT IN ('','.') 
2494                                THEN table_vcf.INFO 
2495                                ELSE '' 
2496                            END
2497                        FROM {table_vcf} as table_vcf
2498                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2499                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2500                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2501                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2502                        )
2503        """
2504        self.conn.execute(sql_query_update)
2505
2506        # Drop temporary table
2507        sql_drop = f"DROP TABLE {table_vcf}"
2508        self.conn.execute(sql_drop)
2509
2510    def drop_variants_table(self) -> None:
2511        """
2512        > This function drops the variants table
2513        """
2514
2515        table_variants = self.get_table_variants()
2516        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2517        self.conn.execute(sql_table_variants)
2518
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table, populated via the
        database `hash` function over the assembly name and the `#CHROM`,
        `POS`, `REF`, `ALT` columns (plus an SVTYPE-related literal).

        :param variant_id_column: name of the column to create in the variants
        table, defaults to "variant_id"
        :type variant_id_column: str (optional)
        :param force: if truthy, (re)compute the column even if it already exists
        :type force: bool
        :return: the name of the column that contains the variant_id
        """

        # Assembly: param takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the hardcoded name
        # "variant_id", not `variant_id_column` — with a custom column name the
        # check may not reflect the actual column; confirm intent.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is single-quoted in the SQL, so
            # hash() receives the constant string '"<prefix>SVTYPE"' rather than
            # the value of that column — confirm whether the exploded SVTYPE
            # column value was intended here.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2577
2578    def get_variant_id_column(
2579        self, variant_id_column: str = "variant_id", force: bool = None
2580    ) -> str:
2581        """
2582        This function returns the variant_id column name
2583
2584        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2585        defaults to variant_id
2586        :type variant_id_column: str (optional)
2587        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2588        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2589        if it is not already set, or if it is set
2590        :type force: bool
2591        :return: The variant_id column name.
2592        """
2593
2594        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2595
2596    ###
2597    # Annotation
2598    ###
2599
2600    def scan_databases(
2601        self, database_formats: list["parquet"], database_releases: list = ["current"]
2602    ) -> dict:
2603        """
2604        The function `scan_databases` scans for available databases based on specified formats and
2605        releases.
2606
2607        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2608        of the databases to be scanned. In this case, the accepted format is "parquet"
2609        :type database_formats: list ["parquet"]
2610        :param database_releases: The `database_releases` parameter is a list that specifies the
2611        releases of the databases to be scanned. In the provided function, the default value for
2612        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2613        databases that are in the "current"
2614        :type database_releases: list
2615        :return: The function `scan_databases` returns a dictionary containing information about
2616        databases that match the specified formats and releases.
2617        """
2618
2619        # Config
2620        config = self.get_config()
2621
2622        # Param
2623        param = self.get_param()
2624
2625        # Param - Assembly
2626        assembly = param.get("assembly", config.get("assembly", None))
2627        if not assembly:
2628            assembly = DEFAULT_ASSEMBLY
2629            log.warning(f"Default assembly '{assembly}'")
2630
2631        # Scan for availabled databases
2632        log.info(
2633            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2634        )
2635        databases_infos_dict = databases_infos(
2636            database_folder_releases=database_releases,
2637            database_formats=database_formats,
2638            assembly=assembly,
2639            config=config,
2640        )
2641        log.info(
2642            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2643        )
2644
2645        return databases_infos_dict
2646
2647    def annotation(self) -> None:
2648        """
2649        It annotates the VCF file with the annotations specified in the config file.
2650        """
2651
2652        # Config
2653        config = self.get_config()
2654
2655        # Param
2656        param = self.get_param()
2657
2658        # Param - Assembly
2659        assembly = param.get("assembly", config.get("assembly", None))
2660        if not assembly:
2661            assembly = DEFAULT_ASSEMBLY
2662            log.warning(f"Default assembly '{assembly}'")
2663
2664        # annotations databases folders
2665        annotations_databases = set(
2666            config.get("folders", {})
2667            .get("databases", {})
2668            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2669            + config.get("folders", {})
2670            .get("databases", {})
2671            .get("parquet", ["~/howard/databases/parquet/current"])
2672            + config.get("folders", {})
2673            .get("databases", {})
2674            .get("bcftools", ["~/howard/databases/bcftools/current"])
2675        )
2676
2677        # Get param annotations
2678        if param.get("annotations", None) and isinstance(
2679            param.get("annotations", None), str
2680        ):
2681            log.debug(param.get("annotations", None))
2682            param_annotation_list = param.get("annotations").split(",")
2683        else:
2684            param_annotation_list = []
2685
2686        # Each tools param
2687        if param.get("annotation_parquet", None) != None:
2688            log.debug(
2689                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2690            )
2691            if isinstance(param.get("annotation_parquet", None), list):
2692                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2693            else:
2694                param_annotation_list.append(param.get("annotation_parquet"))
2695        if param.get("annotation_snpsift", None) != None:
2696            if isinstance(param.get("annotation_snpsift", None), list):
2697                param_annotation_list.append(
2698                    "snpsift:"
2699                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2700                )
2701            else:
2702                param_annotation_list.append(
2703                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2704                )
2705        if param.get("annotation_snpeff", None) != None:
2706            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2707        if param.get("annotation_bcftools", None) != None:
2708            if isinstance(param.get("annotation_bcftools", None), list):
2709                param_annotation_list.append(
2710                    "bcftools:"
2711                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2712                )
2713            else:
2714                param_annotation_list.append(
2715                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2716                )
2717        if param.get("annotation_annovar", None) != None:
2718            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2719        if param.get("annotation_exomiser", None) != None:
2720            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2721        if param.get("annotation_splice", None) != None:
2722            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2723
2724        # Merge param annotations list
2725        param["annotations"] = ",".join(param_annotation_list)
2726
2727        # debug
2728        log.debug(f"param_annotations={param['annotations']}")
2729
2730        if param.get("annotations"):
2731
2732            # Log
2733            # log.info("Annotations - Check annotation parameters")
2734
2735            if not "annotation" in param:
2736                param["annotation"] = {}
2737
2738            # List of annotations parameters
2739            annotations_list_input = {}
2740            if isinstance(param.get("annotations", None), str):
2741                annotation_file_list = [
2742                    value for value in param.get("annotations", "").split(",")
2743                ]
2744                for annotation_file in annotation_file_list:
2745                    annotations_list_input[annotation_file] = {"INFO": None}
2746            else:
2747                annotations_list_input = param.get("annotations", {})
2748
2749            log.info(f"Quick Annotations:")
2750            for annotation_key in list(annotations_list_input.keys()):
2751                log.info(f"   {annotation_key}")
2752
2753            # List of annotations and associated fields
2754            annotations_list = {}
2755
2756            for annotation_file in annotations_list_input:
2757
2758                # Explode annotations if ALL
2759                if (
2760                    annotation_file.upper() == "ALL"
2761                    or annotation_file.upper().startswith("ALL:")
2762                ):
2763
2764                    # check ALL parameters (formats, releases)
2765                    annotation_file_split = annotation_file.split(":")
2766                    database_formats = "parquet"
2767                    database_releases = "current"
2768                    for annotation_file_option in annotation_file_split[1:]:
2769                        database_all_options_split = annotation_file_option.split("=")
2770                        if database_all_options_split[0] == "format":
2771                            database_formats = database_all_options_split[1].split("+")
2772                        if database_all_options_split[0] == "release":
2773                            database_releases = database_all_options_split[1].split("+")
2774
2775                    # Scan for availabled databases
2776                    databases_infos_dict = self.scan_databases(
2777                        database_formats=database_formats,
2778                        database_releases=database_releases,
2779                    )
2780
2781                    # Add found databases in annotation parameters
2782                    for database_infos in databases_infos_dict.keys():
2783                        annotations_list[database_infos] = {"INFO": None}
2784
2785                else:
2786                    annotations_list[annotation_file] = annotations_list_input[
2787                        annotation_file
2788                    ]
2789
2790            # Check each databases
2791            if len(annotations_list):
2792
2793                log.info(
2794                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2795                )
2796
2797                for annotation_file in annotations_list:
2798
2799                    # Init
2800                    annotations = annotations_list.get(annotation_file, None)
2801
2802                    # Annotation snpEff
2803                    if annotation_file.startswith("snpeff"):
2804
2805                        log.debug(f"Quick Annotation snpEff")
2806
2807                        if "snpeff" not in param["annotation"]:
2808                            param["annotation"]["snpeff"] = {}
2809
2810                        if "options" not in param["annotation"]["snpeff"]:
2811                            param["annotation"]["snpeff"]["options"] = ""
2812
2813                        # snpEff options in annotations
2814                        param["annotation"]["snpeff"]["options"] = "".join(
2815                            annotation_file.split(":")[1:]
2816                        )
2817
2818                    # Annotation Annovar
2819                    elif annotation_file.startswith("annovar"):
2820
2821                        log.debug(f"Quick Annotation Annovar")
2822
2823                        if "annovar" not in param["annotation"]:
2824                            param["annotation"]["annovar"] = {}
2825
2826                        if "annotations" not in param["annotation"]["annovar"]:
2827                            param["annotation"]["annovar"]["annotations"] = {}
2828
2829                        # Options
2830                        annotation_file_split = annotation_file.split(":")
2831                        for annotation_file_annotation in annotation_file_split[1:]:
2832                            if annotation_file_annotation:
2833                                param["annotation"]["annovar"]["annotations"][
2834                                    annotation_file_annotation
2835                                ] = annotations
2836
2837                    # Annotation Exomiser
2838                    elif annotation_file.startswith("exomiser"):
2839
2840                        log.debug(f"Quick Annotation Exomiser")
2841
2842                        param["annotation"]["exomiser"] = params_string_to_dict(
2843                            annotation_file
2844                        )
2845
2846                    # Annotation Splice
2847                    elif annotation_file.startswith("splice"):
2848
2849                        log.debug(f"Quick Annotation Splice")
2850
2851                        param["annotation"]["splice"] = params_string_to_dict(
2852                            annotation_file
2853                        )
2854
2855                    # Annotation Parquet or BCFTOOLS
2856                    else:
2857
2858                        # Tools detection
2859                        if annotation_file.startswith("bcftools:"):
2860                            annotation_tool_initial = "bcftools"
2861                            annotation_file = ":".join(annotation_file.split(":")[1:])
2862                        elif annotation_file.startswith("snpsift:"):
2863                            annotation_tool_initial = "snpsift"
2864                            annotation_file = ":".join(annotation_file.split(":")[1:])
2865                        else:
2866                            annotation_tool_initial = None
2867
2868                        # list of files
2869                        annotation_file_list = annotation_file.replace("+", ":").split(
2870                            ":"
2871                        )
2872
2873                        for annotation_file in annotation_file_list:
2874
2875                            if annotation_file:
2876
2877                                # Annotation tool initial
2878                                annotation_tool = annotation_tool_initial
2879
2880                                # Find file
2881                                annotation_file_found = None
2882
2883                                # Expand user
2884                                annotation_file = full_path(annotation_file)
2885
2886                                if os.path.exists(annotation_file):
2887                                    annotation_file_found = annotation_file
2888
2889                                else:
2890                                    # Find within assembly folders
2891                                    for annotations_database in annotations_databases:
2892                                        found_files = find_all(
2893                                            annotation_file,
2894                                            os.path.join(
2895                                                annotations_database, assembly
2896                                            ),
2897                                        )
2898                                        if len(found_files) > 0:
2899                                            annotation_file_found = found_files[0]
2900                                            break
2901                                    if not annotation_file_found and not assembly:
2902                                        # Find within folders
2903                                        for (
2904                                            annotations_database
2905                                        ) in annotations_databases:
2906                                            found_files = find_all(
2907                                                annotation_file, annotations_database
2908                                            )
2909                                            if len(found_files) > 0:
2910                                                annotation_file_found = found_files[0]
2911                                                break
2912                                log.debug(
2913                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2914                                )
2915
2916                                # Full path
2917                                annotation_file_found = full_path(annotation_file_found)
2918
2919                                if annotation_file_found:
2920
2921                                    database = Database(database=annotation_file_found)
2922                                    quick_annotation_format = database.get_format()
2923                                    quick_annotation_is_compressed = (
2924                                        database.is_compressed()
2925                                    )
2926                                    quick_annotation_is_indexed = os.path.exists(
2927                                        f"{annotation_file_found}.tbi"
2928                                    )
2929                                    bcftools_preference = False
2930
2931                                    # Check Annotation Tool
2932                                    if not annotation_tool:
2933                                        if (
2934                                            bcftools_preference
2935                                            and quick_annotation_format
2936                                            in ["vcf", "bed"]
2937                                            and quick_annotation_is_compressed
2938                                            and quick_annotation_is_indexed
2939                                        ):
2940                                            annotation_tool = "bcftools"
2941                                        elif quick_annotation_format in [
2942                                            "vcf",
2943                                            "bed",
2944                                            "tsv",
2945                                            "tsv",
2946                                            "csv",
2947                                            "json",
2948                                            "tbl",
2949                                            "parquet",
2950                                            "duckdb",
2951                                        ]:
2952                                            annotation_tool = "parquet"
2953                                        else:
2954                                            log.error(
2955                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
2956                                            )
2957                                            raise ValueError(
2958                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
2959                                            )
2960
2961                                    log.debug(
2962                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
2963                                    )
2964
2965                                    # Annotation Tool dispatch
2966                                    if annotation_tool:
2967                                        if annotation_tool not in param["annotation"]:
2968                                            param["annotation"][annotation_tool] = {}
2969                                        if (
2970                                            "annotations"
2971                                            not in param["annotation"][annotation_tool]
2972                                        ):
2973                                            param["annotation"][annotation_tool][
2974                                                "annotations"
2975                                            ] = {}
2976                                        param["annotation"][annotation_tool][
2977                                            "annotations"
2978                                        ][annotation_file_found] = annotations
2979
2980                                else:
2981                                    log.error(
2982                                        f"Quick Annotation File {annotation_file} does NOT exist"
2983                                    )
2984
2985                self.set_param(param)
2986
2987        if param.get("annotation", None):
2988            log.info("Annotations")
2989            if param.get("annotation", {}).get("parquet", None):
2990                log.info("Annotations 'parquet'...")
2991                self.annotation_parquet()
2992            if param.get("annotation", {}).get("bcftools", None):
2993                log.info("Annotations 'bcftools'...")
2994                self.annotation_bcftools()
2995            if param.get("annotation", {}).get("snpsift", None):
2996                log.info("Annotations 'snpsift'...")
2997                self.annotation_snpsift()
2998            if param.get("annotation", {}).get("annovar", None):
2999                log.info("Annotations 'annovar'...")
3000                self.annotation_annovar()
3001            if param.get("annotation", {}).get("snpeff", None):
3002                log.info("Annotations 'snpeff'...")
3003                self.annotation_snpeff()
3004            if param.get("annotation", {}).get("exomiser", None) is not None:
3005                log.info("Annotations 'exomiser'...")
3006                self.annotation_exomiser()
3007            if param.get("annotation", {}).get("splice", None) is not None:
3008                log.info("Annotations 'splice' ...")
3009                self.annotation_splice()
3010
3011        # Explode INFOS fields into table fields
3012        if self.get_explode_infos():
3013            self.explode_infos(
3014                prefix=self.get_explode_infos_prefix(),
3015                fields=self.get_explode_infos_fields(),
3016                force=True,
3017            )
3018
3019    def annotation_snpsift(self, threads: int = None) -> None:
3020        """
3021        This function annotate with bcftools
3022
3023        :param threads: Number of threads to use
3024        :return: the value of the variable "return_value".
3025        """
3026
3027        # DEBUG
3028        log.debug("Start annotation with bcftools databases")
3029
3030        # Threads
3031        if not threads:
3032            threads = self.get_threads()
3033        log.debug("Threads: " + str(threads))
3034
3035        # Config
3036        config = self.get_config()
3037        log.debug("Config: " + str(config))
3038
3039        # Config - snpSift
3040        snpsift_bin_command = get_bin_command(
3041            bin="SnpSift.jar",
3042            tool="snpsift",
3043            bin_type="jar",
3044            config=config,
3045            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3046        )
3047        if not snpsift_bin_command:
3048            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3049            log.error(msg_err)
3050            raise ValueError(msg_err)
3051
3052        # Config - bcftools
3053        bcftools_bin_command = get_bin_command(
3054            bin="bcftools",
3055            tool="bcftools",
3056            bin_type="bin",
3057            config=config,
3058            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3059        )
3060        if not bcftools_bin_command:
3061            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3062            log.error(msg_err)
3063            raise ValueError(msg_err)
3064
3065        # Config - BCFTools databases folders
3066        databases_folders = set(
3067            self.get_config()
3068            .get("folders", {})
3069            .get("databases", {})
3070            .get("annotations", ["."])
3071            + self.get_config()
3072            .get("folders", {})
3073            .get("databases", {})
3074            .get("bcftools", ["."])
3075        )
3076        log.debug("Databases annotations: " + str(databases_folders))
3077
3078        # Param
3079        annotations = (
3080            self.get_param()
3081            .get("annotation", {})
3082            .get("snpsift", {})
3083            .get("annotations", None)
3084        )
3085        log.debug("Annotations: " + str(annotations))
3086
3087        # Assembly
3088        assembly = self.get_param().get(
3089            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3090        )
3091
3092        # Data
3093        table_variants = self.get_table_variants()
3094
3095        # Check if not empty
3096        log.debug("Check if not empty")
3097        sql_query_chromosomes = (
3098            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3099        )
3100        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3101        if not sql_query_chromosomes_df["count"][0]:
3102            log.info(f"VCF empty")
3103            return
3104
3105        # VCF header
3106        vcf_reader = self.get_header()
3107        log.debug("Initial header: " + str(vcf_reader.infos))
3108
3109        # Existing annotations
3110        for vcf_annotation in self.get_header().infos:
3111
3112            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3113            log.debug(
3114                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3115            )
3116
3117        if annotations:
3118
3119            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3120
3121                # Export VCF file
3122                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3123
3124                # Init
3125                commands = {}
3126
3127                for annotation in annotations:
3128                    annotation_fields = annotations[annotation]
3129
3130                    # Annotation Name
3131                    annotation_name = os.path.basename(annotation)
3132
3133                    if not annotation_fields:
3134                        annotation_fields = {"INFO": None}
3135
3136                    log.debug(f"Annotation '{annotation_name}'")
3137                    log.debug(
3138                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3139                    )
3140
3141                    # Create Database
3142                    database = Database(
3143                        database=annotation,
3144                        databases_folders=databases_folders,
3145                        assembly=assembly,
3146                    )
3147
3148                    # Find files
3149                    db_file = database.get_database()
3150                    db_file = full_path(db_file)
3151                    db_hdr_file = database.get_header_file()
3152                    db_hdr_file = full_path(db_hdr_file)
3153                    db_file_type = database.get_format()
3154                    db_tbi_file = f"{db_file}.tbi"
3155                    db_file_compressed = database.is_compressed()
3156
3157                    # Check if compressed
3158                    if not db_file_compressed:
3159                        log.error(
3160                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3161                        )
3162                        raise ValueError(
3163                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3164                        )
3165
3166                    # Check if indexed
3167                    if not os.path.exists(db_tbi_file):
3168                        log.error(
3169                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3170                        )
3171                        raise ValueError(
3172                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3173                        )
3174
3175                    # Check index - try to create if not exists
3176                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3177                        log.error("Annotation failed: database not valid")
3178                        log.error(f"Annotation annotation file: {db_file}")
3179                        log.error(f"Annotation annotation header: {db_hdr_file}")
3180                        log.error(f"Annotation annotation index: {db_tbi_file}")
3181                        raise ValueError(
3182                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3183                        )
3184                    else:
3185
3186                        log.debug(
3187                            f"Annotation '{annotation}' - file: "
3188                            + str(db_file)
3189                            + " and "
3190                            + str(db_hdr_file)
3191                        )
3192
3193                        # Load header as VCF object
3194                        db_hdr_vcf = Variants(input=db_hdr_file)
3195                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3196                        log.debug(
3197                            "Annotation database header: "
3198                            + str(db_hdr_vcf_header_infos)
3199                        )
3200
3201                        # For all fields in database
3202                        annotation_fields_full = False
3203                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3204                            annotation_fields = {
3205                                key: key for key in db_hdr_vcf_header_infos
3206                            }
3207                            log.debug(
3208                                "Annotation database header - All annotations added: "
3209                                + str(annotation_fields)
3210                            )
3211                            annotation_fields_full = True
3212
3213                        # # Create file for field rename
3214                        # log.debug("Create file for field rename")
3215                        # tmp_rename = NamedTemporaryFile(
3216                        #     prefix=self.get_prefix(),
3217                        #     dir=self.get_tmp_dir(),
3218                        #     suffix=".rename",
3219                        #     delete=False,
3220                        # )
3221                        # tmp_rename_name = tmp_rename.name
3222                        # tmp_files.append(tmp_rename_name)
3223
3224                        # Number of fields
3225                        nb_annotation_field = 0
3226                        annotation_list = []
3227                        annotation_infos_rename_list = []
3228
3229                        for annotation_field in annotation_fields:
3230
3231                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3232                            annotation_fields_new_name = annotation_fields.get(
3233                                annotation_field, annotation_field
3234                            )
3235                            if not annotation_fields_new_name:
3236                                annotation_fields_new_name = annotation_field
3237
3238                            # Check if field is in DB and if field is not elready in input data
3239                            if (
3240                                annotation_field in db_hdr_vcf.get_header().infos
3241                                and annotation_fields_new_name
3242                                not in self.get_header().infos
3243                            ):
3244
3245                                log.info(
3246                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3247                                )
3248
3249                                # BCFTools annotate param to rename fields
3250                                if annotation_field != annotation_fields_new_name:
3251                                    annotation_infos_rename_list.append(
3252                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3253                                    )
3254
3255                                # Add INFO field to header
3256                                db_hdr_vcf_header_infos_number = (
3257                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3258                                )
3259                                db_hdr_vcf_header_infos_type = (
3260                                    db_hdr_vcf_header_infos[annotation_field].type
3261                                    or "String"
3262                                )
3263                                db_hdr_vcf_header_infos_description = (
3264                                    db_hdr_vcf_header_infos[annotation_field].desc
3265                                    or f"{annotation_field} description"
3266                                )
3267                                db_hdr_vcf_header_infos_source = (
3268                                    db_hdr_vcf_header_infos[annotation_field].source
3269                                    or "unknown"
3270                                )
3271                                db_hdr_vcf_header_infos_version = (
3272                                    db_hdr_vcf_header_infos[annotation_field].version
3273                                    or "unknown"
3274                                )
3275
3276                                vcf_reader.infos[annotation_fields_new_name] = (
3277                                    vcf.parser._Info(
3278                                        annotation_fields_new_name,
3279                                        db_hdr_vcf_header_infos_number,
3280                                        db_hdr_vcf_header_infos_type,
3281                                        db_hdr_vcf_header_infos_description,
3282                                        db_hdr_vcf_header_infos_source,
3283                                        db_hdr_vcf_header_infos_version,
3284                                        self.code_type_map[
3285                                            db_hdr_vcf_header_infos_type
3286                                        ],
3287                                    )
3288                                )
3289
3290                                annotation_list.append(annotation_field)
3291
3292                                nb_annotation_field += 1
3293
3294                            else:
3295
3296                                if (
3297                                    annotation_field
3298                                    not in db_hdr_vcf.get_header().infos
3299                                ):
3300                                    log.warning(
3301                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3302                                    )
3303                                if (
3304                                    annotation_fields_new_name
3305                                    in self.get_header().infos
3306                                ):
3307                                    log.warning(
3308                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3309                                    )
3310
3311                        log.info(
3312                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3313                        )
3314
3315                        annotation_infos = ",".join(annotation_list)
3316
3317                        if annotation_infos != "":
3318
3319                            # Annotated VCF (and error file)
3320                            tmp_annotation_vcf_name = os.path.join(
3321                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3322                            )
3323                            tmp_annotation_vcf_name_err = (
3324                                tmp_annotation_vcf_name + ".err"
3325                            )
3326
3327                            # Add fields to annotate
3328                            if not annotation_fields_full:
3329                                annotation_infos_option = f"-info {annotation_infos}"
3330                            else:
3331                                annotation_infos_option = ""
3332
3333                            # Info fields rename
3334                            if annotation_infos_rename_list:
3335                                annotation_infos_rename = " -c " + ",".join(
3336                                    annotation_infos_rename_list
3337                                )
3338                            else:
3339                                annotation_infos_rename = ""
3340
3341                            # Annotate command
3342                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3343
3344                            # Add command
3345                            commands[command_annotate] = tmp_annotation_vcf_name
3346
3347                if commands:
3348
3349                    # Export VCF file
3350                    self.export_variant_vcf(
3351                        vcf_file=tmp_vcf_name,
3352                        remove_info=True,
3353                        add_samples=False,
3354                        index=True,
3355                    )
3356                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3357
3358                    # Num command
3359                    nb_command = 0
3360
3361                    # Annotate
3362                    for command_annotate in commands:
3363                        nb_command += 1
3364                        log.info(
3365                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3366                        )
3367                        log.debug(f"command_annotate={command_annotate}")
3368                        run_parallel_commands([command_annotate], threads)
3369
3370                        # Debug
3371                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3372
3373                        # Update variants
3374                        log.info(
3375                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3376                        )
3377                        self.update_from_vcf(commands[command_annotate])
3378
3379    def annotation_bcftools(self, threads: int = None) -> None:
3380        """
3381        This function annotate with bcftools
3382
3383        :param threads: Number of threads to use
3384        :return: the value of the variable "return_value".
3385        """
3386
3387        # DEBUG
3388        log.debug("Start annotation with bcftools databases")
3389
3390        # Threads
3391        if not threads:
3392            threads = self.get_threads()
3393        log.debug("Threads: " + str(threads))
3394
3395        # Config
3396        config = self.get_config()
3397        log.debug("Config: " + str(config))
3398
3399        # DEBUG
3400        delete_tmp = True
3401        if self.get_config().get("verbosity", "warning") in ["debug"]:
3402            delete_tmp = False
3403            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3404
3405        # Config - BCFTools bin command
3406        bcftools_bin_command = get_bin_command(
3407            bin="bcftools",
3408            tool="bcftools",
3409            bin_type="bin",
3410            config=config,
3411            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3412        )
3413        if not bcftools_bin_command:
3414            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3415            log.error(msg_err)
3416            raise ValueError(msg_err)
3417
3418        # Config - BCFTools databases folders
3419        databases_folders = set(
3420            self.get_config()
3421            .get("folders", {})
3422            .get("databases", {})
3423            .get("annotations", ["."])
3424            + self.get_config()
3425            .get("folders", {})
3426            .get("databases", {})
3427            .get("bcftools", ["."])
3428        )
3429        log.debug("Databases annotations: " + str(databases_folders))
3430
3431        # Param
3432        annotations = (
3433            self.get_param()
3434            .get("annotation", {})
3435            .get("bcftools", {})
3436            .get("annotations", None)
3437        )
3438        log.debug("Annotations: " + str(annotations))
3439
3440        # Assembly
3441        assembly = self.get_param().get(
3442            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3443        )
3444
3445        # Data
3446        table_variants = self.get_table_variants()
3447
3448        # Check if not empty
3449        log.debug("Check if not empty")
3450        sql_query_chromosomes = (
3451            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3452        )
3453        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3454        if not sql_query_chromosomes_df["count"][0]:
3455            log.info(f"VCF empty")
3456            return
3457
3458        # Export in VCF
3459        log.debug("Create initial file to annotate")
3460        tmp_vcf = NamedTemporaryFile(
3461            prefix=self.get_prefix(),
3462            dir=self.get_tmp_dir(),
3463            suffix=".vcf.gz",
3464            delete=False,
3465        )
3466        tmp_vcf_name = tmp_vcf.name
3467
3468        # VCF header
3469        vcf_reader = self.get_header()
3470        log.debug("Initial header: " + str(vcf_reader.infos))
3471
3472        # Existing annotations
3473        for vcf_annotation in self.get_header().infos:
3474
3475            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3476            log.debug(
3477                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3478            )
3479
3480        if annotations:
3481
3482            tmp_ann_vcf_list = []
3483            commands = []
3484            tmp_files = []
3485            err_files = []
3486
3487            for annotation in annotations:
3488                annotation_fields = annotations[annotation]
3489
3490                # Annotation Name
3491                annotation_name = os.path.basename(annotation)
3492
3493                if not annotation_fields:
3494                    annotation_fields = {"INFO": None}
3495
3496                log.debug(f"Annotation '{annotation_name}'")
3497                log.debug(
3498                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3499                )
3500
3501                # Create Database
3502                database = Database(
3503                    database=annotation,
3504                    databases_folders=databases_folders,
3505                    assembly=assembly,
3506                )
3507
3508                # Find files
3509                db_file = database.get_database()
3510                db_file = full_path(db_file)
3511                db_hdr_file = database.get_header_file()
3512                db_hdr_file = full_path(db_hdr_file)
3513                db_file_type = database.get_format()
3514                db_tbi_file = f"{db_file}.tbi"
3515                db_file_compressed = database.is_compressed()
3516
3517                # Check if compressed
3518                if not db_file_compressed:
3519                    log.error(
3520                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3521                    )
3522                    raise ValueError(
3523                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3524                    )
3525
3526                # Check if indexed
3527                if not os.path.exists(db_tbi_file):
3528                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3529                    raise ValueError(
3530                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3531                    )
3532
3533                # Check index - try to create if not exists
3534                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3535                    log.error("Annotation failed: database not valid")
3536                    log.error(f"Annotation annotation file: {db_file}")
3537                    log.error(f"Annotation annotation header: {db_hdr_file}")
3538                    log.error(f"Annotation annotation index: {db_tbi_file}")
3539                    raise ValueError(
3540                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3541                    )
3542                else:
3543
3544                    log.debug(
3545                        f"Annotation '{annotation}' - file: "
3546                        + str(db_file)
3547                        + " and "
3548                        + str(db_hdr_file)
3549                    )
3550
3551                    # Load header as VCF object
3552                    db_hdr_vcf = Variants(input=db_hdr_file)
3553                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3554                    log.debug(
3555                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3556                    )
3557
3558                    # For all fields in database
3559                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3560                        annotation_fields = {
3561                            key: key for key in db_hdr_vcf_header_infos
3562                        }
3563                        log.debug(
3564                            "Annotation database header - All annotations added: "
3565                            + str(annotation_fields)
3566                        )
3567
3568                    # Number of fields
3569                    nb_annotation_field = 0
3570                    annotation_list = []
3571
3572                    for annotation_field in annotation_fields:
3573
3574                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3575                        annotation_fields_new_name = annotation_fields.get(
3576                            annotation_field, annotation_field
3577                        )
3578                        if not annotation_fields_new_name:
3579                            annotation_fields_new_name = annotation_field
3580
3581                        # Check if field is in DB and if field is not elready in input data
3582                        if (
3583                            annotation_field in db_hdr_vcf.get_header().infos
3584                            and annotation_fields_new_name
3585                            not in self.get_header().infos
3586                        ):
3587
3588                            log.info(
3589                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3590                            )
3591
3592                            # Add INFO field to header
3593                            db_hdr_vcf_header_infos_number = (
3594                                db_hdr_vcf_header_infos[annotation_field].num or "."
3595                            )
3596                            db_hdr_vcf_header_infos_type = (
3597                                db_hdr_vcf_header_infos[annotation_field].type
3598                                or "String"
3599                            )
3600                            db_hdr_vcf_header_infos_description = (
3601                                db_hdr_vcf_header_infos[annotation_field].desc
3602                                or f"{annotation_field} description"
3603                            )
3604                            db_hdr_vcf_header_infos_source = (
3605                                db_hdr_vcf_header_infos[annotation_field].source
3606                                or "unknown"
3607                            )
3608                            db_hdr_vcf_header_infos_version = (
3609                                db_hdr_vcf_header_infos[annotation_field].version
3610                                or "unknown"
3611                            )
3612
3613                            vcf_reader.infos[annotation_fields_new_name] = (
3614                                vcf.parser._Info(
3615                                    annotation_fields_new_name,
3616                                    db_hdr_vcf_header_infos_number,
3617                                    db_hdr_vcf_header_infos_type,
3618                                    db_hdr_vcf_header_infos_description,
3619                                    db_hdr_vcf_header_infos_source,
3620                                    db_hdr_vcf_header_infos_version,
3621                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3622                                )
3623                            )
3624
3625                            # annotation_list.append(annotation_field)
3626                            if annotation_field != annotation_fields_new_name:
3627                                annotation_list.append(
3628                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3629                                )
3630                            else:
3631                                annotation_list.append(annotation_field)
3632
3633                            nb_annotation_field += 1
3634
3635                        else:
3636
3637                            if annotation_field not in db_hdr_vcf.get_header().infos:
3638                                log.warning(
3639                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3640                                )
3641                            if annotation_fields_new_name in self.get_header().infos:
3642                                log.warning(
3643                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3644                                )
3645
3646                    log.info(
3647                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3648                    )
3649
3650                    annotation_infos = ",".join(annotation_list)
3651
3652                    if annotation_infos != "":
3653
3654                        # Protect header for bcftools (remove "#CHROM" and variants line)
3655                        log.debug("Protect Header file - remove #CHROM line if exists")
3656                        tmp_header_vcf = NamedTemporaryFile(
3657                            prefix=self.get_prefix(),
3658                            dir=self.get_tmp_dir(),
3659                            suffix=".hdr",
3660                            delete=False,
3661                        )
3662                        tmp_header_vcf_name = tmp_header_vcf.name
3663                        tmp_files.append(tmp_header_vcf_name)
3664                        # Command
3665                        if db_hdr_file.endswith(".gz"):
3666                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3667                        else:
3668                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3669                        # Run
3670                        run_parallel_commands([command_extract_header], 1)
3671
                        # Find chromosomes
3673                        log.debug("Find chromosomes ")
3674                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3675                        sql_query_chromosomes_df = self.get_query_to_df(
3676                            sql_query_chromosomes
3677                        )
3678                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3679
3680                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3681
3682                        # BED columns in the annotation file
3683                        if db_file_type in ["bed"]:
3684                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3685
3686                        for chrom in chomosomes_list:
3687
3688                            # Create BED on initial VCF
3689                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3690                            tmp_bed = NamedTemporaryFile(
3691                                prefix=self.get_prefix(),
3692                                dir=self.get_tmp_dir(),
3693                                suffix=".bed",
3694                                delete=False,
3695                            )
3696                            tmp_bed_name = tmp_bed.name
3697                            tmp_files.append(tmp_bed_name)
3698
                            # Detect regions
3700                            log.debug(
3701                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3702                            )
3703                            window = 1000000
3704                            sql_query_intervals_for_bed = f"""
3705                                SELECT  \"#CHROM\",
3706                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3707                                        \"POS\"+{window}
3708                                FROM {table_variants} as table_variants
3709                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3710                            """
3711                            regions = self.conn.execute(
3712                                sql_query_intervals_for_bed
3713                            ).fetchall()
3714                            merged_regions = merge_regions(regions)
3715                            log.debug(
3716                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3717                            )
3718
3719                            header = ["#CHROM", "START", "END"]
3720                            with open(tmp_bed_name, "w") as f:
3721                                # Write the header with tab delimiter
3722                                f.write("\t".join(header) + "\n")
3723                                for d in merged_regions:
3724                                    # Write each data row with tab delimiter
3725                                    f.write("\t".join(map(str, d)) + "\n")
3726
3727                            # Tmp files
3728                            tmp_annotation_vcf = NamedTemporaryFile(
3729                                prefix=self.get_prefix(),
3730                                dir=self.get_tmp_dir(),
3731                                suffix=".vcf.gz",
3732                                delete=False,
3733                            )
3734                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3735                            tmp_files.append(tmp_annotation_vcf_name)
3736                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3737                            tmp_annotation_vcf_name_err = (
3738                                tmp_annotation_vcf_name + ".err"
3739                            )
3740                            err_files.append(tmp_annotation_vcf_name_err)
3741
3742                            # Annotate Command
3743                            log.debug(
3744                                f"Annotation '{annotation}' - add bcftools command"
3745                            )
3746
3747                            # Command
3748                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3749
3750                            # Add command
3751                            commands.append(command_annotate)
3752
3753            # if some commands
3754            if commands:
3755
3756                # Export VCF file
3757                self.export_variant_vcf(
3758                    vcf_file=tmp_vcf_name,
3759                    remove_info=True,
3760                    add_samples=False,
3761                    index=True,
3762                )
3763
3764                # Threads
3765                # calculate threads for annotated commands
3766                if commands:
3767                    threads_bcftools_annotate = round(threads / len(commands))
3768                else:
3769                    threads_bcftools_annotate = 1
3770
3771                if not threads_bcftools_annotate:
3772                    threads_bcftools_annotate = 1
3773
3774                # Add threads option to bcftools commands
3775                if threads_bcftools_annotate > 1:
3776                    commands_threaded = []
3777                    for command in commands:
3778                        commands_threaded.append(
3779                            command.replace(
3780                                f"{bcftools_bin_command} annotate ",
3781                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3782                            )
3783                        )
3784                    commands = commands_threaded
3785
3786                # Command annotation multithreading
3787                log.debug(f"Annotation - Annotation commands: " + str(commands))
3788                log.info(
3789                    f"Annotation - Annotation multithreaded in "
3790                    + str(len(commands))
3791                    + " commands"
3792                )
3793
3794                run_parallel_commands(commands, threads)
3795
3796                # Merge
3797                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3798
3799                if tmp_ann_vcf_list_cmd:
3800
3801                    # Tmp file
3802                    tmp_annotate_vcf = NamedTemporaryFile(
3803                        prefix=self.get_prefix(),
3804                        dir=self.get_tmp_dir(),
3805                        suffix=".vcf.gz",
3806                        delete=True,
3807                    )
3808                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3809                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3810                    err_files.append(tmp_annotate_vcf_name_err)
3811
3812                    # Tmp file remove command
3813                    tmp_files_remove_command = ""
3814                    if tmp_files:
3815                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3816
3817                    # Command merge
3818                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3819                    log.info(
3820                        f"Annotation - Annotation merging "
3821                        + str(len(commands))
3822                        + " annotated files"
3823                    )
3824                    log.debug(f"Annotation - merge command: {merge_command}")
3825                    run_parallel_commands([merge_command], 1)
3826
3827                    # Error messages
3828                    log.info(f"Error/Warning messages:")
3829                    error_message_command_all = []
3830                    error_message_command_warning = []
3831                    error_message_command_err = []
3832                    for err_file in err_files:
3833                        with open(err_file, "r") as f:
3834                            for line in f:
3835                                message = line.strip()
3836                                error_message_command_all.append(message)
3837                                if line.startswith("[W::"):
3838                                    error_message_command_warning.append(message)
3839                                if line.startswith("[E::"):
3840                                    error_message_command_err.append(
3841                                        f"{err_file}: " + message
3842                                    )
3843                    # log info
3844                    for message in list(
3845                        set(error_message_command_err + error_message_command_warning)
3846                    ):
3847                        log.info(f"   {message}")
3848                    # debug info
3849                    for message in list(set(error_message_command_all)):
3850                        log.debug(f"   {message}")
3851                    # failed
3852                    if len(error_message_command_err):
3853                        log.error("Annotation failed: Error in commands")
3854                        raise ValueError("Annotation failed: Error in commands")
3855
3856                    # Update variants
3857                    log.info(f"Annotation - Updating...")
3858                    self.update_from_vcf(tmp_annotate_vcf_name)
3859
3860    def annotation_exomiser(self, threads: int = None) -> None:
3861        """
3862        This function annotate with Exomiser
3863
3864        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3865        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
3867            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
3869            Default : None
3870        - "preset" (string):
3871            Analysis preset (available in config folder).
3872            Used if no full "analysis" is provided.
3873            Default: "exome"
3874        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
3876            Either a dict, or a file in JSON or YAML format.
3877            Default: None
3878        - "subject" (dict):
3879            Sample parameters (see Exomiser docs).
3880            Example:
3881                "subject":
3882                    {
3883                        "id": "ISDBM322017",
3884                        "sex": "FEMALE"
3885                    }
3886            Default: None
3887        - "sample" (string):
3888            Sample name to construct "subject" section:
3889                "subject":
3890                    {
3891                        "id": "<sample>",
3892                        "sex": "UNKNOWN_SEX"
3893                    }
3894            Default: None
3895        - "phenotypicFeatures" (dict)
3896            Phenotypic features to construct "subject" section.
3897            Example:
3898                "phenotypicFeatures":
3899                    [
3900                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3901                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3902                    ]
3903        - "hpo" (list)
3904            List of HPO ids as phenotypic features.
3905            Example:
3906                "hpo": ['0001156', '0001363', '0011304', '0010055']
3907            Default: []
3908        - "outputOptions" (dict):
3909            Output options (see Exomiser docs).
3910            Default:
3911                "output_options" =
3912                    {
3913                        "outputContributingVariantsOnly": False,
3914                        "numGenes": 0,
3915                        "outputFormats": ["TSV_VARIANT", "VCF"]
3916                    }
3917        - "transcript_source" (string):
3918            Transcript source (either "refseq", "ucsc", "ensembl")
3919            Default: "refseq"
3920        - "exomiser_to_info" (boolean):
3921            Add exomiser TSV file columns as INFO fields in VCF.
3922            Default: False
3923        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
3926            Default: None (provided by application.properties configuration file)
3927        - "exomiser_application_properties" (file):
3928            Exomiser configuration file (see Exomiser docs).
3929            Useful to automatically download databases (especially for specific genome databases).
3930
3931        Notes:
3932        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
3934
3935        :param threads: The number of threads to use
3936        :return: None.
3937        """
3938
3939        # DEBUG
3940        log.debug("Start annotation with Exomiser databases")
3941
3942        # Threads
3943        if not threads:
3944            threads = self.get_threads()
3945        log.debug("Threads: " + str(threads))
3946
3947        # Config
3948        config = self.get_config()
3949        log.debug("Config: " + str(config))
3950
3951        # Config - Folders - Databases
3952        databases_folders = (
3953            config.get("folders", {})
3954            .get("databases", {})
3955            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
3956        )
3957        databases_folders = full_path(databases_folders)
3958        if not os.path.exists(databases_folders):
3959            log.error(f"Databases annotations: {databases_folders} NOT found")
3960        log.debug("Databases annotations: " + str(databases_folders))
3961
3962        # Config - Exomiser
3963        exomiser_bin_command = get_bin_command(
3964            bin="exomiser-cli*.jar",
3965            tool="exomiser",
3966            bin_type="jar",
3967            config=config,
3968            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
3969        )
3970        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
3971        if not exomiser_bin_command:
3972            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
3973            log.error(msg_err)
3974            raise ValueError(msg_err)
3975
3976        # Param
3977        param = self.get_param()
3978        log.debug("Param: " + str(param))
3979
3980        # Param - Exomiser
3981        param_exomiser = param.get("annotation", {}).get("exomiser", {})
3982        log.debug(f"Param Exomiser: {param_exomiser}")
3983
3984        # Param - Assembly
3985        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
3986        log.debug("Assembly: " + str(assembly))
3987
3988        # Data
3989        table_variants = self.get_table_variants()
3990
3991        # Check if not empty
3992        log.debug("Check if not empty")
3993        sql_query_chromosomes = (
3994            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3995        )
3996        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
3997            log.info(f"VCF empty")
3998            return False
3999
4000        # VCF header
4001        vcf_reader = self.get_header()
4002        log.debug("Initial header: " + str(vcf_reader.infos))
4003
4004        # Samples
4005        samples = self.get_header_sample_list()
4006        if not samples:
4007            log.error("No Samples in VCF")
4008            return False
4009        log.debug(f"Samples: {samples}")
4010
4011        # Memory limit
4012        memory_limit = self.get_memory("8G")
4013        log.debug(f"memory_limit: {memory_limit}")
4014
4015        # Exomiser java options
4016        exomiser_java_options = (
4017            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4018        )
4019        log.debug(f"Exomiser java options: {exomiser_java_options}")
4020
4021        # Download Exomiser (if not exists)
4022        exomiser_release = param_exomiser.get("release", None)
4023        exomiser_application_properties = param_exomiser.get(
4024            "exomiser_application_properties", None
4025        )
4026        databases_download_exomiser(
4027            assemblies=[assembly],
4028            exomiser_folder=databases_folders,
4029            exomiser_release=exomiser_release,
4030            exomiser_phenotype_release=exomiser_release,
4031            exomiser_application_properties=exomiser_application_properties,
4032        )
4033
4034        # Force annotation
4035        force_update_annotation = True
4036
4037        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4038            log.debug("Start annotation Exomiser")
4039
4040            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4041
4042                # tmp_dir = "/tmp/exomiser"
4043
4044                ### ANALYSIS ###
4045                ################
4046
4047                # Create analysis.json through analysis dict
4048                # either analysis in param or by default
                # (depending on preset exome/genome)
4050
4051                # Init analysis dict
4052                param_exomiser_analysis_dict = {}
4053
4054                # analysis from param
4055                param_exomiser_analysis = param_exomiser.get("analysis", {})
4056                param_exomiser_analysis = full_path(param_exomiser_analysis)
4057
                # If analysis in param -> load analysis json
4059                if param_exomiser_analysis:
4060
4061                    # If param analysis is a file and exists
4062                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4063                        param_exomiser_analysis
4064                    ):
4065                        # Load analysis file into analysis dict (either yaml or json)
4066                        with open(param_exomiser_analysis) as json_file:
4067                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4068
4069                    # If param analysis is a dict
4070                    elif isinstance(param_exomiser_analysis, dict):
4071                        # Load analysis dict into analysis dict (either yaml or json)
4072                        param_exomiser_analysis_dict = param_exomiser_analysis
4073
4074                    # Error analysis type
4075                    else:
4076                        log.error(f"Analysis type unknown. Check param file.")
4077                        raise ValueError(f"Analysis type unknown. Check param file.")
4078
4079                # Case no input analysis config file/dict
4080                # Use preset (exome/genome) to open default config file
4081                if not param_exomiser_analysis_dict:
4082
4083                    # default preset
4084                    default_preset = "exome"
4085
4086                    # Get param preset or default preset
4087                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4088
4089                    # Try to find if preset is a file
4090                    if os.path.exists(param_exomiser_preset):
4091                        # Preset file is provided in full path
4092                        param_exomiser_analysis_default_config_file = (
4093                            param_exomiser_preset
4094                        )
4095                    # elif os.path.exists(full_path(param_exomiser_preset)):
4096                    #     # Preset file is provided in full path
4097                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4098                    elif os.path.exists(
4099                        os.path.join(folder_config, param_exomiser_preset)
4100                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
4102                        param_exomiser_analysis_default_config_file = os.path.join(
4103                            folder_config, param_exomiser_preset
4104                        )
4105                    else:
4106                        # Construct preset file
4107                        param_exomiser_analysis_default_config_file = os.path.join(
4108                            folder_config,
4109                            f"preset-{param_exomiser_preset}-analysis.json",
4110                        )
4111
4112                    # If preset file exists
4113                    param_exomiser_analysis_default_config_file = full_path(
4114                        param_exomiser_analysis_default_config_file
4115                    )
4116                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
4118                        with open(
4119                            param_exomiser_analysis_default_config_file
4120                        ) as json_file:
4121                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4122                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4123                                json_file
4124                            )
4125
4126                    # Error preset file
4127                    else:
4128                        log.error(
4129                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4130                        )
4131                        raise ValueError(
4132                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4133                        )
4134
4135                # If no analysis dict created
4136                if not param_exomiser_analysis_dict:
4137                    log.error(f"No analysis config")
4138                    raise ValueError(f"No analysis config")
4139
4140                # Log
4141                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4142
4143                ### PHENOPACKET ###
4144                ###################
4145
4146                # If no PhenoPacket in analysis dict -> check in param
4147                if "phenopacket" not in param_exomiser_analysis_dict:
4148
                    # If PhenoPacket in param -> load phenopacket json
4150                    if param_exomiser.get("phenopacket", None):
4151
4152                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4153                        param_exomiser_phenopacket = full_path(
4154                            param_exomiser_phenopacket
4155                        )
4156
4157                        # If param phenopacket is a file and exists
4158                        if isinstance(
4159                            param_exomiser_phenopacket, str
4160                        ) and os.path.exists(param_exomiser_phenopacket):
4161                            # Load phenopacket file into analysis dict (either yaml or json)
4162                            with open(param_exomiser_phenopacket) as json_file:
4163                                param_exomiser_analysis_dict["phenopacket"] = (
4164                                    yaml.safe_load(json_file)
4165                                )
4166
4167                        # If param phenopacket is a dict
4168                        elif isinstance(param_exomiser_phenopacket, dict):
4169                            # Load phenopacket dict into analysis dict (either yaml or json)
4170                            param_exomiser_analysis_dict["phenopacket"] = (
4171                                param_exomiser_phenopacket
4172                            )
4173
4174                        # Error phenopacket type
4175                        else:
4176                            log.error(f"Phenopacket type unknown. Check param file.")
4177                            raise ValueError(
4178                                f"Phenopacket type unknown. Check param file."
4179                            )
4180
4181                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4182                if "phenopacket" not in param_exomiser_analysis_dict:
4183
4184                    # Init PhenoPacket
4185                    param_exomiser_analysis_dict["phenopacket"] = {
4186                        "id": "analysis",
4187                        "proband": {},
4188                    }
4189
4190                    ### Add subject ###
4191
4192                    # If subject exists
4193                    param_exomiser_subject = param_exomiser.get("subject", {})
4194
4195                    # If subject not exists -> found sample ID
4196                    if not param_exomiser_subject:
4197
4198                        # Found sample ID in param
4199                        sample = param_exomiser.get("sample", None)
4200
4201                        # Find sample ID (first sample)
4202                        if not sample:
4203                            sample_list = self.get_header_sample_list()
4204                            if len(sample_list) > 0:
4205                                sample = sample_list[0]
4206                            else:
4207                                log.error(f"No sample found")
4208                                raise ValueError(f"No sample found")
4209
4210                        # Create subject
4211                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4212
4213                    # Add to dict
4214                    param_exomiser_analysis_dict["phenopacket"][
4215                        "subject"
4216                    ] = param_exomiser_subject
4217
4218                    ### Add "phenotypicFeatures" ###
4219
4220                    # If phenotypicFeatures exists
4221                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4222                        "phenotypicFeatures", []
4223                    )
4224
4225                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4226                    if not param_exomiser_phenotypicfeatures:
4227
4228                        # Found HPO in param
4229                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4230
4231                        # Split HPO if list in string format separated by comma
4232                        if isinstance(param_exomiser_hpo, str):
4233                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4234
4235                        # Create HPO list
4236                        for hpo in param_exomiser_hpo:
4237                            hpo_clean = re.sub("[^0-9]", "", hpo)
4238                            param_exomiser_phenotypicfeatures.append(
4239                                {
4240                                    "type": {
4241                                        "id": f"HP:{hpo_clean}",
4242                                        "label": f"HP:{hpo_clean}",
4243                                    }
4244                                }
4245                            )
4246
4247                    # Add to dict
4248                    param_exomiser_analysis_dict["phenopacket"][
4249                        "phenotypicFeatures"
4250                    ] = param_exomiser_phenotypicfeatures
4251
4252                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4253                    if not param_exomiser_phenotypicfeatures:
4254                        for step in param_exomiser_analysis_dict.get(
4255                            "analysis", {}
4256                        ).get("steps", []):
4257                            if "hiPhivePrioritiser" in step:
4258                                param_exomiser_analysis_dict.get("analysis", {}).get(
4259                                    "steps", []
4260                                ).remove(step)
4261
4262                ### Add Input File ###
4263
4264                # Initial file name and htsFiles
4265                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4266                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4267                    {
4268                        "uri": tmp_vcf_name,
4269                        "htsFormat": "VCF",
4270                        "genomeAssembly": assembly,
4271                    }
4272                ]
4273
4274                ### Add metaData ###
4275
4276                # If metaData not in analysis dict
4277                if "metaData" not in param_exomiser_analysis_dict:
4278                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4279                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4280                        "createdBy": "howard",
4281                        "phenopacketSchemaVersion": 1,
4282                    }
4283
4284                ### OutputOptions ###
4285
4286                # Init output result folder
4287                output_results = os.path.join(tmp_dir, "results")
4288
4289                # If no outputOptions in analysis dict
4290                if "outputOptions" not in param_exomiser_analysis_dict:
4291
4292                    # default output formats
4293                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4294
4295                    # Get outputOptions in param
4296                    output_options = param_exomiser.get("outputOptions", None)
4297
4298                    # If no output_options in param -> check
4299                    if not output_options:
4300                        output_options = {
4301                            "outputContributingVariantsOnly": False,
4302                            "numGenes": 0,
4303                            "outputFormats": defaut_output_formats,
4304                        }
4305
4306                    # Replace outputDirectory in output options
4307                    output_options["outputDirectory"] = output_results
4308                    output_options["outputFileName"] = "howard"
4309
4310                    # Add outputOptions in analysis dict
4311                    param_exomiser_analysis_dict["outputOptions"] = output_options
4312
4313                else:
4314
4315                    # Replace output_results and output format (if exists in param)
4316                    param_exomiser_analysis_dict["outputOptions"][
4317                        "outputDirectory"
4318                    ] = output_results
4319                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4320                        list(
4321                            set(
4322                                param_exomiser_analysis_dict.get(
4323                                    "outputOptions", {}
4324                                ).get("outputFormats", [])
4325                                + ["TSV_VARIANT", "VCF"]
4326                            )
4327                        )
4328                    )
4329
4330                # log
4331                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4332
4333                ### ANALYSIS FILE ###
4334                #####################
4335
4336                ### Full JSON analysis config file ###
4337
4338                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4339                with open(exomiser_analysis, "w") as fp:
4340                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4341
4342                ### SPLIT analysis and sample config files
4343
4344                # Splitted analysis dict
4345                param_exomiser_analysis_dict_for_split = (
4346                    param_exomiser_analysis_dict.copy()
4347                )
4348
4349                # Phenopacket JSON file
4350                exomiser_analysis_phenopacket = os.path.join(
4351                    tmp_dir, "analysis_phenopacket.json"
4352                )
4353                with open(exomiser_analysis_phenopacket, "w") as fp:
4354                    json.dump(
4355                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4356                        fp,
4357                        indent=4,
4358                    )
4359
4360                # Analysis JSON file without Phenopacket parameters
4361                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4362                exomiser_analysis_analysis = os.path.join(
4363                    tmp_dir, "analysis_analysis.json"
4364                )
4365                with open(exomiser_analysis_analysis, "w") as fp:
4366                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4367
                ### INITIAL VCF file ###
4369                #######################
4370
                ### Create list of samples to use and include into initial VCF file ####
4372
4373                # Subject (main sample)
4374                # Get sample ID in analysis dict
4375                sample_subject = (
4376                    param_exomiser_analysis_dict.get("phenopacket", {})
4377                    .get("subject", {})
4378                    .get("id", None)
4379                )
4380                sample_proband = (
4381                    param_exomiser_analysis_dict.get("phenopacket", {})
4382                    .get("proband", {})
4383                    .get("subject", {})
4384                    .get("id", None)
4385                )
4386                sample = []
4387                if sample_subject:
4388                    sample.append(sample_subject)
4389                if sample_proband:
4390                    sample.append(sample_proband)
4391
4392                # Get sample ID within Pedigree
4393                pedigree_persons_list = (
4394                    param_exomiser_analysis_dict.get("phenopacket", {})
4395                    .get("pedigree", {})
4396                    .get("persons", {})
4397                )
4398
4399                # Create list with all sample ID in pedigree (if exists)
4400                pedigree_persons = []
4401                for person in pedigree_persons_list:
4402                    pedigree_persons.append(person.get("individualId"))
4403
                # Concat subject sample ID and sample IDs in pedigree samples
4405                samples = list(set(sample + pedigree_persons))
4406
4407                # Check if sample list is not empty
4408                if not samples:
4409                    log.error(f"No samples found")
4410                    raise ValueError(f"No samples found")
4411
4412                # Create VCF with sample (either sample in param or first one by default)
4413                # Export VCF file
4414                self.export_variant_vcf(
4415                    vcf_file=tmp_vcf_name,
4416                    remove_info=True,
4417                    add_samples=True,
4418                    list_samples=samples,
4419                    index=False,
4420                )
4421
4422                ### Execute Exomiser ###
4423                ########################
4424
4425                # Init command
4426                exomiser_command = ""
4427
4428                # Command exomiser options
4429                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4430
4431                # Release
4432                exomiser_release = param_exomiser.get("release", None)
4433                if exomiser_release:
4434                    # phenotype data version
4435                    exomiser_options += (
4436                        f" --exomiser.phenotype.data-version={exomiser_release} "
4437                    )
4438                    # data version
4439                    exomiser_options += (
4440                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4441                    )
4442                    # variant white list
4443                    variant_white_list_file = (
4444                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4445                    )
4446                    if os.path.exists(
4447                        os.path.join(
4448                            databases_folders, assembly, variant_white_list_file
4449                        )
4450                    ):
4451                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4452
4453                # transcript_source
4454                transcript_source = param_exomiser.get(
4455                    "transcript_source", None
4456                )  # ucsc, refseq, ensembl
4457                if transcript_source:
4458                    exomiser_options += (
4459                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4460                    )
4461
4462                # If analysis contain proband param
4463                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4464                    "proband", {}
4465                ):
4466                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4467
4468                # If no proband (usually uniq sample)
4469                else:
4470                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4471
4472                # Log
4473                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4474
4475                # Run command
4476                result = subprocess.call(
4477                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4478                )
4479                if result:
4480                    log.error("Exomiser command failed")
4481                    raise ValueError("Exomiser command failed")
4482
4483                ### RESULTS ###
4484                ###############
4485
4486                ### Annotate with TSV fields ###
4487
4488                # Init result tsv file
4489                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4490
4491                # Init result tsv file
4492                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4493
4494                # Parse TSV file and explode columns in INFO field
4495                if exomiser_to_info and os.path.exists(output_results_tsv):
4496
4497                    # Log
4498                    log.debug("Exomiser columns to VCF INFO field")
4499
4500                    # Retrieve columns and types
4501                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4502                    output_results_tsv_df = self.get_query_to_df(query)
4503                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4504
4505                    # Init concat fields for update
4506                    sql_query_update_concat_fields = []
4507
4508                    # Fields to avoid
4509                    fields_to_avoid = [
4510                        "CONTIG",
4511                        "START",
4512                        "END",
4513                        "REF",
4514                        "ALT",
4515                        "QUAL",
4516                        "FILTER",
4517                        "GENOTYPE",
4518                    ]
4519
4520                    # List all columns to add into header
4521                    for header_column in output_results_tsv_columns:
4522
4523                        # If header column is enable
4524                        if header_column not in fields_to_avoid:
4525
4526                            # Header info type
4527                            header_info_type = "String"
4528                            header_column_df = output_results_tsv_df[header_column]
4529                            header_column_df_dtype = header_column_df.dtype
4530                            if header_column_df_dtype == object:
4531                                if (
4532                                    pd.to_numeric(header_column_df, errors="coerce")
4533                                    .notnull()
4534                                    .all()
4535                                ):
4536                                    header_info_type = "Float"
4537                            else:
4538                                header_info_type = "Integer"
4539
4540                            # Header info
4541                            characters_to_validate = ["-"]
4542                            pattern = "[" + "".join(characters_to_validate) + "]"
4543                            header_info_name = re.sub(
4544                                pattern,
4545                                "_",
4546                                f"Exomiser_{header_column}".replace("#", ""),
4547                            )
4548                            header_info_number = "."
4549                            header_info_description = (
4550                                f"Exomiser {header_column} annotation"
4551                            )
4552                            header_info_source = "Exomiser"
4553                            header_info_version = "unknown"
4554                            header_info_code = CODE_TYPE_MAP[header_info_type]
4555                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4556                                header_info_name,
4557                                header_info_number,
4558                                header_info_type,
4559                                header_info_description,
4560                                header_info_source,
4561                                header_info_version,
4562                                header_info_code,
4563                            )
4564
4565                            # Add field to add for update to concat fields
4566                            sql_query_update_concat_fields.append(
4567                                f"""
4568                                CASE
4569                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4570                                    THEN concat(
4571                                        '{header_info_name}=',
4572                                        table_parquet."{header_column}",
4573                                        ';'
4574                                        )
4575
4576                                    ELSE ''
4577                                END
4578                            """
4579                            )
4580
4581                    # Update query
4582                    sql_query_update = f"""
4583                        UPDATE {table_variants} as table_variants
4584                            SET INFO = concat(
4585                                            CASE
4586                                                WHEN INFO NOT IN ('', '.')
4587                                                THEN INFO
4588                                                ELSE ''
4589                                            END,
4590                                            CASE
4591                                                WHEN table_variants.INFO NOT IN ('','.')
4592                                                THEN ';'
4593                                                ELSE ''
4594                                            END,
4595                                            (
4596                                            SELECT 
4597                                                concat(
4598                                                    {",".join(sql_query_update_concat_fields)}
4599                                                )
4600                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4601                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4602                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4603                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4604                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4605                                            )
4606                                        )
4607                            ;
4608                        """
4609
4610                    # Update
4611                    self.conn.execute(sql_query_update)
4612
4613                ### Annotate with VCF INFO field ###
4614
4615                # Init result VCF file
4616                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4617
4618                # If VCF exists
4619                if os.path.exists(output_results_vcf):
4620
4621                    # Log
4622                    log.debug("Exomiser result VCF update variants")
4623
4624                    # Find Exomiser INFO field annotation in header
4625                    with gzip.open(output_results_vcf, "rt") as f:
4626                        header_list = self.read_vcf_header(f)
4627                    exomiser_vcf_header = vcf.Reader(
4628                        io.StringIO("\n".join(header_list))
4629                    )
4630
4631                    # Add annotation INFO field to header
4632                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4633
4634                    # Update variants with VCF
4635                    self.update_from_vcf(output_results_vcf)
4636
4637        return True
4638
4639    def annotation_snpeff(self, threads: int = None) -> None:
4640        """
4641        This function annotate with snpEff
4642
4643        :param threads: The number of threads to use
4644        :return: the value of the variable "return_value".
4645        """
4646
4647        # DEBUG
4648        log.debug("Start annotation with snpeff databases")
4649
4650        # Threads
4651        if not threads:
4652            threads = self.get_threads()
4653        log.debug("Threads: " + str(threads))
4654
4655        # DEBUG
4656        delete_tmp = True
4657        if self.get_config().get("verbosity", "warning") in ["debug"]:
4658            delete_tmp = False
4659            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4660
4661        # Config
4662        config = self.get_config()
4663        log.debug("Config: " + str(config))
4664
4665        # Config - Folders - Databases
4666        databases_folders = (
4667            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4668        )
4669        log.debug("Databases annotations: " + str(databases_folders))
4670
4671        # # Config - Java
4672        # java_bin = get_bin(
4673        #     tool="java",
4674        #     bin="java",
4675        #     bin_type="bin",
4676        #     config=config,
4677        #     default_folder="/usr/bin",
4678        # )
4679        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4680        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4681        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4682
4683        # # Config - snpEff bin
4684        # snpeff_jar = get_bin(
4685        #     tool="snpeff",
4686        #     bin="snpEff.jar",
4687        #     bin_type="jar",
4688        #     config=config,
4689        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4690        # )
4691        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4692        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4693        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4694
4695        # Config - snpEff bin command
4696        snpeff_bin_command = get_bin_command(
4697            bin="snpEff.jar",
4698            tool="snpeff",
4699            bin_type="jar",
4700            config=config,
4701            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4702        )
4703        if not snpeff_bin_command:
4704            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4705            log.error(msg_err)
4706            raise ValueError(msg_err)
4707
4708        # Config - snpEff databases
4709        snpeff_databases = (
4710            config.get("folders", {})
4711            .get("databases", {})
4712            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4713        )
4714        snpeff_databases = full_path(snpeff_databases)
4715        if snpeff_databases is not None and snpeff_databases != "":
4716            log.debug(f"Create snpEff databases folder")
4717            if not os.path.exists(snpeff_databases):
4718                os.makedirs(snpeff_databases)
4719
4720        # Param
4721        param = self.get_param()
4722        log.debug("Param: " + str(param))
4723
4724        # Param
4725        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4726        log.debug("Options: " + str(options))
4727
4728        # Param - Assembly
4729        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4730
4731        # Param - Options
4732        snpeff_options = (
4733            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4734        )
4735        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4736        snpeff_csvstats = (
4737            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4738        )
4739        if snpeff_stats:
4740            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4741            snpeff_stats = full_path(snpeff_stats)
4742            snpeff_options += f" -stats {snpeff_stats}"
4743        if snpeff_csvstats:
4744            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4745            snpeff_csvstats = full_path(snpeff_csvstats)
4746            snpeff_options += f" -csvStats {snpeff_csvstats}"
4747
4748        # Data
4749        table_variants = self.get_table_variants()
4750
4751        # Check if not empty
4752        log.debug("Check if not empty")
4753        sql_query_chromosomes = (
4754            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4755        )
4756        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4757        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4758            log.info(f"VCF empty")
4759            return
4760
4761        # Export in VCF
4762        log.debug("Create initial file to annotate")
4763        tmp_vcf = NamedTemporaryFile(
4764            prefix=self.get_prefix(),
4765            dir=self.get_tmp_dir(),
4766            suffix=".vcf.gz",
4767            delete=True,
4768        )
4769        tmp_vcf_name = tmp_vcf.name
4770
4771        # VCF header
4772        vcf_reader = self.get_header()
4773        log.debug("Initial header: " + str(vcf_reader.infos))
4774
4775        # Existing annotations
4776        for vcf_annotation in self.get_header().infos:
4777
4778            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4779            log.debug(
4780                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4781            )
4782
4783        # Memory limit
4784        # if config.get("memory", None):
4785        #     memory_limit = config.get("memory", "8G")
4786        # else:
4787        #     memory_limit = "8G"
4788        memory_limit = self.get_memory("8G")
4789        log.debug(f"memory_limit: {memory_limit}")
4790
4791        # snpEff java options
4792        snpeff_java_options = (
4793            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4794        )
4795        log.debug(f"Exomiser java options: {snpeff_java_options}")
4796
4797        force_update_annotation = True
4798
4799        if "ANN" not in self.get_header().infos or force_update_annotation:
4800
4801            # Check snpEff database
4802            log.debug(f"Check snpEff databases {[assembly]}")
4803            databases_download_snpeff(
4804                folder=snpeff_databases, assemblies=[assembly], config=config
4805            )
4806
4807            # Export VCF file
4808            self.export_variant_vcf(
4809                vcf_file=tmp_vcf_name,
4810                remove_info=True,
4811                add_samples=False,
4812                index=True,
4813            )
4814
4815            # Tmp file
4816            err_files = []
4817            tmp_annotate_vcf = NamedTemporaryFile(
4818                prefix=self.get_prefix(),
4819                dir=self.get_tmp_dir(),
4820                suffix=".vcf",
4821                delete=False,
4822            )
4823            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4824            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4825            err_files.append(tmp_annotate_vcf_name_err)
4826
4827            # Command
4828            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4829            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4830            run_parallel_commands([snpeff_command], 1)
4831
4832            # Error messages
4833            log.info(f"Error/Warning messages:")
4834            error_message_command_all = []
4835            error_message_command_warning = []
4836            error_message_command_err = []
4837            for err_file in err_files:
4838                with open(err_file, "r") as f:
4839                    for line in f:
4840                        message = line.strip()
4841                        error_message_command_all.append(message)
4842                        if line.startswith("[W::"):
4843                            error_message_command_warning.append(message)
4844                        if line.startswith("[E::"):
4845                            error_message_command_err.append(f"{err_file}: " + message)
4846            # log info
4847            for message in list(
4848                set(error_message_command_err + error_message_command_warning)
4849            ):
4850                log.info(f"   {message}")
4851            # debug info
4852            for message in list(set(error_message_command_all)):
4853                log.debug(f"   {message}")
4854            # failed
4855            if len(error_message_command_err):
4856                log.error("Annotation failed: Error in commands")
4857                raise ValueError("Annotation failed: Error in commands")
4858
4859            # Find annotation in header
4860            with open(tmp_annotate_vcf_name, "rt") as f:
4861                header_list = self.read_vcf_header(f)
4862            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4863
4864            for ann in annovar_vcf_header.infos:
4865                if ann not in self.get_header().infos:
4866                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4867
4868            # Update variants
4869            log.info(f"Annotation - Updating...")
4870            self.update_from_vcf(tmp_annotate_vcf_name)
4871
4872        else:
4873            if "ANN" in self.get_header().infos:
4874                log.debug(f"Existing snpEff annotations in VCF")
4875            if force_update_annotation:
4876                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4877
4878    def annotation_annovar(self, threads: int = None) -> None:
4879        """
4880        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4881        annotations
4882
4883        :param threads: number of threads to use
4884        :return: the value of the variable "return_value".
4885        """
4886
4887        # DEBUG
4888        log.debug("Start annotation with Annovar databases")
4889
4890        # Threads
4891        if not threads:
4892            threads = self.get_threads()
4893        log.debug("Threads: " + str(threads))
4894
4895        # Tmp en Err files
4896        tmp_files = []
4897        err_files = []
4898
4899        # DEBUG
4900        delete_tmp = True
4901        if self.get_config().get("verbosity", "warning") in ["debug"]:
4902            delete_tmp = False
4903            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4904
4905        # Config
4906        config = self.get_config()
4907        log.debug("Config: " + str(config))
4908
4909        # Config - Folders - Databases
4910        databases_folders = (
4911            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4912        )
4913        log.debug("Databases annotations: " + str(databases_folders))
4914
4915        # Config - annovar bin command
4916        annovar_bin_command = get_bin_command(
4917            bin="table_annovar.pl",
4918            tool="annovar",
4919            bin_type="perl",
4920            config=config,
4921            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
4922        )
4923        if not annovar_bin_command:
4924            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
4925            log.error(msg_err)
4926            raise ValueError(msg_err)
4927
4928        # Config - BCFTools bin command
4929        bcftools_bin_command = get_bin_command(
4930            bin="bcftools",
4931            tool="bcftools",
4932            bin_type="bin",
4933            config=config,
4934            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
4935        )
4936        if not bcftools_bin_command:
4937            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
4938            log.error(msg_err)
4939            raise ValueError(msg_err)
4940
4941        # Config - annovar databases
4942        annovar_databases = (
4943            config.get("folders", {})
4944            .get("databases", {})
4945            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
4946        )
4947        annovar_databases = full_path(annovar_databases)
4948        if annovar_databases != "" and not os.path.exists(annovar_databases):
4949            os.makedirs(annovar_databases)
4950
4951        # Param
4952        param = self.get_param()
4953        log.debug("Param: " + str(param))
4954
4955        # Param - options
4956        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
4957        log.debug("Options: " + str(options))
4958
4959        # Param - annotations
4960        annotations = (
4961            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
4962        )
4963        log.debug("Annotations: " + str(annotations))
4964
4965        # Param - Assembly
4966        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4967
4968        # Annovar database assembly
4969        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
4970        if annovar_databases_assembly != "" and not os.path.exists(
4971            annovar_databases_assembly
4972        ):
4973            os.makedirs(annovar_databases_assembly)
4974
4975        # Data
4976        table_variants = self.get_table_variants()
4977
4978        # Check if not empty
4979        log.debug("Check if not empty")
4980        sql_query_chromosomes = (
4981            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4982        )
4983        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
4984        if not sql_query_chromosomes_df["count"][0]:
4985            log.info(f"VCF empty")
4986            return
4987
4988        # VCF header
4989        vcf_reader = self.get_header()
4990        log.debug("Initial header: " + str(vcf_reader.infos))
4991
4992        # Existing annotations
4993        for vcf_annotation in self.get_header().infos:
4994
4995            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4996            log.debug(
4997                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4998            )
4999
5000        force_update_annotation = True
5001
5002        if annotations:
5003
5004            commands = []
5005            tmp_annotates_vcf_name_list = []
5006
5007            # Export in VCF
5008            log.debug("Create initial file to annotate")
5009            tmp_vcf = NamedTemporaryFile(
5010                prefix=self.get_prefix(),
5011                dir=self.get_tmp_dir(),
5012                suffix=".vcf.gz",
5013                delete=False,
5014            )
5015            tmp_vcf_name = tmp_vcf.name
5016            tmp_files.append(tmp_vcf_name)
5017            tmp_files.append(tmp_vcf_name + ".tbi")
5018
5019            # Export VCF file
5020            self.export_variant_vcf(
5021                vcf_file=tmp_vcf_name,
5022                remove_info=".",
5023                add_samples=False,
5024                index=True,
5025            )
5026
5027            # Create file for field rename
5028            log.debug("Create file for field rename")
5029            tmp_rename = NamedTemporaryFile(
5030                prefix=self.get_prefix(),
5031                dir=self.get_tmp_dir(),
5032                suffix=".rename",
5033                delete=False,
5034            )
5035            tmp_rename_name = tmp_rename.name
5036            tmp_files.append(tmp_rename_name)
5037
5038            # Check Annovar database
5039            log.debug(
5040                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5041            )
5042            databases_download_annovar(
5043                folder=annovar_databases,
5044                files=list(annotations.keys()),
5045                assemblies=[assembly],
5046            )
5047
5048            for annotation in annotations:
5049                annotation_fields = annotations[annotation]
5050
5051                if not annotation_fields:
5052                    annotation_fields = {"INFO": None}
5053
5054                log.info(f"Annotations Annovar - database '{annotation}'")
5055                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5056
5057                # Tmp file for annovar
5058                err_files = []
5059                tmp_annotate_vcf_directory = TemporaryDirectory(
5060                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5061                )
5062                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5063                tmp_annotate_vcf_name_annovar = (
5064                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5065                )
5066                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5067                err_files.append(tmp_annotate_vcf_name_err)
5068                tmp_files.append(tmp_annotate_vcf_name_err)
5069
5070                # Tmp file final vcf annotated by annovar
5071                tmp_annotate_vcf = NamedTemporaryFile(
5072                    prefix=self.get_prefix(),
5073                    dir=self.get_tmp_dir(),
5074                    suffix=".vcf.gz",
5075                    delete=False,
5076                )
5077                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5078                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5079                tmp_files.append(tmp_annotate_vcf_name)
5080                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5081
5082                # Number of fields
5083                annotation_list = []
5084                annotation_renamed_list = []
5085
5086                for annotation_field in annotation_fields:
5087
5088                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5089                    annotation_fields_new_name = annotation_fields.get(
5090                        annotation_field, annotation_field
5091                    )
5092                    if not annotation_fields_new_name:
5093                        annotation_fields_new_name = annotation_field
5094
5095                    if (
5096                        force_update_annotation
5097                        or annotation_fields_new_name not in self.get_header().infos
5098                    ):
5099                        annotation_list.append(annotation_field)
5100                        annotation_renamed_list.append(annotation_fields_new_name)
5101                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5102                        log.warning(
5103                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5104                        )
5105
5106                    # Add rename info
5107                    run_parallel_commands(
5108                        [
5109                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5110                        ],
5111                        1,
5112                    )
5113
5114                # log.debug("fields_to_removed: " + str(fields_to_removed))
5115                log.debug("annotation_list: " + str(annotation_list))
5116
5117                # protocol
5118                protocol = annotation
5119
5120                # argument
5121                argument = ""
5122
5123                # operation
5124                operation = "f"
5125                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5126                    "ensGene"
5127                ):
5128                    operation = "g"
5129                    if options.get("genebase", None):
5130                        argument = f"""'{options.get("genebase","")}'"""
5131                elif annotation in ["cytoBand"]:
5132                    operation = "r"
5133
5134                # argument option
5135                argument_option = ""
5136                if argument != "":
5137                    argument_option = " --argument " + argument
5138
5139                # command options
5140                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5141                for option in options:
5142                    if option not in ["genebase"]:
5143                        command_options += f""" --{option}={options[option]}"""
5144
5145                # Command
5146
5147                # Command - Annovar
5148                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5149                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5150
5151                # Command - start pipe
5152                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5153
5154                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5155                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5156
5157                # Command - Special characters (refGene annotation)
5158                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5159
5160                # Command - Clean empty fields (with value ".")
5161                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5162
5163                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5164                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5165                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5166                    # for ann in annotation_renamed_list:
5167                    for ann in annotation_list:
5168                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5169
5170                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5171
5172                # Command - indexing
5173                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5174
5175                log.debug(f"Annotation - Annovar command: {command_annovar}")
5176                run_parallel_commands([command_annovar], 1)
5177
5178                # Error messages
5179                log.info(f"Error/Warning messages:")
5180                error_message_command_all = []
5181                error_message_command_warning = []
5182                error_message_command_err = []
5183                for err_file in err_files:
5184                    with open(err_file, "r") as f:
5185                        for line in f:
5186                            message = line.strip()
5187                            error_message_command_all.append(message)
5188                            if line.startswith("[W::") or line.startswith("WARNING"):
5189                                error_message_command_warning.append(message)
5190                            if line.startswith("[E::") or line.startswith("ERROR"):
5191                                error_message_command_err.append(
5192                                    f"{err_file}: " + message
5193                                )
5194                # log info
5195                for message in list(
5196                    set(error_message_command_err + error_message_command_warning)
5197                ):
5198                    log.info(f"   {message}")
5199                # debug info
5200                for message in list(set(error_message_command_all)):
5201                    log.debug(f"   {message}")
5202                # failed
5203                if len(error_message_command_err):
5204                    log.error("Annotation failed: Error in commands")
5205                    raise ValueError("Annotation failed: Error in commands")
5206
5207            if tmp_annotates_vcf_name_list:
5208
5209                # List of annotated files
5210                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5211
5212                # Tmp file
5213                tmp_annotate_vcf = NamedTemporaryFile(
5214                    prefix=self.get_prefix(),
5215                    dir=self.get_tmp_dir(),
5216                    suffix=".vcf.gz",
5217                    delete=False,
5218                )
5219                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5220                tmp_files.append(tmp_annotate_vcf_name)
5221                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5222                err_files.append(tmp_annotate_vcf_name_err)
5223                tmp_files.append(tmp_annotate_vcf_name_err)
5224
5225                # Command merge
5226                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5227                log.info(
5228                    f"Annotation Annovar - Annotation merging "
5229                    + str(len(tmp_annotates_vcf_name_list))
5230                    + " annotated files"
5231                )
5232                log.debug(f"Annotation - merge command: {merge_command}")
5233                run_parallel_commands([merge_command], 1)
5234
5235                # Find annotation in header
5236                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5237                    header_list = self.read_vcf_header(f)
5238                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5239
5240                for ann in annovar_vcf_header.infos:
5241                    if ann not in self.get_header().infos:
5242                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5243
5244                # Update variants
5245                log.info(f"Annotation Annovar - Updating...")
5246                self.update_from_vcf(tmp_annotate_vcf_name)
5247
5248            # Clean files
5249            # Tmp file remove command
5250            if True:
5251                tmp_files_remove_command = ""
5252                if tmp_files:
5253                    tmp_files_remove_command = " ".join(tmp_files)
5254                clean_command = f" rm -f {tmp_files_remove_command} "
5255                log.debug(f"Annotation Annovar - Annotation cleaning ")
5256                log.debug(f"Annotation - cleaning command: {clean_command}")
5257                run_parallel_commands([clean_command], 1)
5258
5259    # Parquet
5260    def annotation_parquet(self, threads: int = None) -> None:
5261        """
5262        It takes a VCF file, and annotates it with a parquet file
5263
5264        :param threads: number of threads to use for the annotation
5265        :return: the value of the variable "result".
5266        """
5267
5268        # DEBUG
5269        log.debug("Start annotation with parquet databases")
5270
5271        # Threads
5272        if not threads:
5273            threads = self.get_threads()
5274        log.debug("Threads: " + str(threads))
5275
5276        # DEBUG
5277        delete_tmp = True
5278        if self.get_config().get("verbosity", "warning") in ["debug"]:
5279            delete_tmp = False
5280            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5281
5282        # Config
5283        databases_folders = set(
5284            self.get_config()
5285            .get("folders", {})
5286            .get("databases", {})
5287            .get("annotations", ["."])
5288            + self.get_config()
5289            .get("folders", {})
5290            .get("databases", {})
5291            .get("parquet", ["."])
5292        )
5293        log.debug("Databases annotations: " + str(databases_folders))
5294
5295        # Param
5296        annotations = (
5297            self.get_param()
5298            .get("annotation", {})
5299            .get("parquet", {})
5300            .get("annotations", None)
5301        )
5302        log.debug("Annotations: " + str(annotations))
5303
5304        # Assembly
5305        assembly = self.get_param().get(
5306            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5307        )
5308
5309        # Force Update Annotation
5310        force_update_annotation = (
5311            self.get_param()
5312            .get("annotation", {})
5313            .get("options", {})
5314            .get("annotations_update", False)
5315        )
5316        log.debug(f"force_update_annotation={force_update_annotation}")
5317        force_append_annotation = (
5318            self.get_param()
5319            .get("annotation", {})
5320            .get("options", {})
5321            .get("annotations_append", False)
5322        )
5323        log.debug(f"force_append_annotation={force_append_annotation}")
5324
5325        # Data
5326        table_variants = self.get_table_variants()
5327
5328        # Check if not empty
5329        log.debug("Check if not empty")
5330        sql_query_chromosomes_df = self.get_query_to_df(
5331            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5332        )
5333        if not sql_query_chromosomes_df["count"][0]:
5334            log.info(f"VCF empty")
5335            return
5336
5337        # VCF header
5338        vcf_reader = self.get_header()
5339        log.debug("Initial header: " + str(vcf_reader.infos))
5340
5341        # Nb Variants POS
5342        log.debug("NB Variants Start")
5343        nb_variants = self.conn.execute(
5344            f"SELECT count(*) AS count FROM variants"
5345        ).fetchdf()["count"][0]
5346        log.debug("NB Variants Stop")
5347
5348        # Existing annotations
5349        for vcf_annotation in self.get_header().infos:
5350
5351            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5352            log.debug(
5353                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5354            )
5355
5356        # Added columns
5357        added_columns = []
5358
5359        # drop indexes
5360        log.debug(f"Drop indexes...")
5361        self.drop_indexes()
5362
5363        if annotations:
5364
5365            if "ALL" in annotations:
5366
5367                all_param = annotations.get("ALL", {})
5368                all_param_formats = all_param.get("formats", None)
5369                all_param_releases = all_param.get("releases", None)
5370
5371                databases_infos_dict = self.scan_databases(
5372                    database_formats=all_param_formats,
5373                    database_releases=all_param_releases,
5374                )
5375                for database_infos in databases_infos_dict.keys():
5376                    if database_infos not in annotations:
5377                        annotations[database_infos] = {"INFO": None}
5378
5379            for annotation in annotations:
5380
5381                if annotation in ["ALL"]:
5382                    continue
5383
5384                # Annotation Name
5385                annotation_name = os.path.basename(annotation)
5386
5387                # Annotation fields
5388                annotation_fields = annotations[annotation]
5389                if not annotation_fields:
5390                    annotation_fields = {"INFO": None}
5391
5392                log.debug(f"Annotation '{annotation_name}'")
5393                log.debug(
5394                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5395                )
5396
5397                # Create Database
5398                database = Database(
5399                    database=annotation,
5400                    databases_folders=databases_folders,
5401                    assembly=assembly,
5402                )
5403
5404                # Find files
5405                parquet_file = database.get_database()
5406                parquet_hdr_file = database.get_header_file()
5407                parquet_type = database.get_type()
5408
5409                # Check if files exists
5410                if not parquet_file or not parquet_hdr_file:
5411                    log.error("Annotation failed: file not found")
5412                    raise ValueError("Annotation failed: file not found")
5413                else:
5414                    # Get parquet connexion
5415                    parquet_sql_attach = database.get_sql_database_attach(
5416                        output="query"
5417                    )
5418                    if parquet_sql_attach:
5419                        self.conn.execute(parquet_sql_attach)
5420                    parquet_file_link = database.get_sql_database_link()
5421                    # Log
5422                    log.debug(
5423                        f"Annotation '{annotation_name}' - file: "
5424                        + str(parquet_file)
5425                        + " and "
5426                        + str(parquet_hdr_file)
5427                    )
5428
5429                    # Database full header columns
5430                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5431                        parquet_hdr_file
5432                    )
5433                    # Log
5434                    log.debug(
5435                        "Annotation database header columns : "
5436                        + str(parquet_hdr_vcf_header_columns)
5437                    )
5438
5439                    # Load header as VCF object
5440                    parquet_hdr_vcf_header_infos = database.get_header().infos
5441                    # Log
5442                    log.debug(
5443                        "Annotation database header: "
5444                        + str(parquet_hdr_vcf_header_infos)
5445                    )
5446
5447                    # Get extra infos
5448                    parquet_columns = database.get_extra_columns()
5449                    # Log
5450                    log.debug("Annotation database Columns: " + str(parquet_columns))
5451
5452                    # Add extra columns if "ALL" in annotation_fields
5453                    # if "ALL" in annotation_fields:
5454                    #     allow_add_extra_column = True
5455                    if "ALL" in annotation_fields and database.get_extra_columns():
5456                        for extra_column in database.get_extra_columns():
5457                            if (
5458                                extra_column not in annotation_fields
5459                                and extra_column.replace("INFO/", "")
5460                                not in parquet_hdr_vcf_header_infos
5461                            ):
5462                                parquet_hdr_vcf_header_infos[extra_column] = (
5463                                    vcf.parser._Info(
5464                                        extra_column,
5465                                        ".",
5466                                        "String",
5467                                        f"{extra_column} description",
5468                                        "unknown",
5469                                        "unknown",
5470                                        self.code_type_map["String"],
5471                                    )
5472                                )
5473
5474                    # For all fields in database
5475                    annotation_fields_all = False
5476                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5477                        annotation_fields_all = True
5478                        annotation_fields = {
5479                            key: key for key in parquet_hdr_vcf_header_infos
5480                        }
5481
5482                        log.debug(
5483                            "Annotation database header - All annotations added: "
5484                            + str(annotation_fields)
5485                        )
5486
5487                    # Init
5488
5489                    # List of annotation fields to use
5490                    sql_query_annotation_update_info_sets = []
5491
5492                    # List of annotation to agregate
5493                    sql_query_annotation_to_agregate = []
5494
5495                    # Number of fields
5496                    nb_annotation_field = 0
5497
5498                    # Annotation fields processed
5499                    annotation_fields_processed = []
5500
5501                    # Columns mapping
5502                    map_columns = database.map_columns(
5503                        columns=annotation_fields, prefixes=["INFO/"]
5504                    )
5505
5506                    # Query dict for fields to remove (update option)
5507                    query_dict_remove = {}
5508
5509                    # Fetch Anotation fields
5510                    for annotation_field in annotation_fields:
5511
5512                        # annotation_field_column
5513                        annotation_field_column = map_columns.get(
5514                            annotation_field, "INFO"
5515                        )
5516
5517                        # field new name, if parametered
5518                        annotation_fields_new_name = annotation_fields.get(
5519                            annotation_field, annotation_field
5520                        )
5521                        if not annotation_fields_new_name:
5522                            annotation_fields_new_name = annotation_field
5523
5524                        # To annotate
5525                        # force_update_annotation = True
5526                        # force_append_annotation = True
5527                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5528                        if annotation_field in parquet_hdr_vcf_header_infos and (
5529                            force_update_annotation
5530                            or force_append_annotation
5531                            or (
5532                                annotation_fields_new_name
5533                                not in self.get_header().infos
5534                            )
5535                        ):
5536
5537                            # Add field to annotation to process list
5538                            annotation_fields_processed.append(
5539                                annotation_fields_new_name
5540                            )
5541
5542                            # explode infos for the field
5543                            annotation_fields_new_name_info_msg = ""
5544                            if (
5545                                force_update_annotation
5546                                and annotation_fields_new_name
5547                                in self.get_header().infos
5548                            ):
5549                                # Remove field from INFO
5550                                query = f"""
5551                                    UPDATE {table_variants} as table_variants
5552                                    SET INFO = REGEXP_REPLACE(
5553                                                concat(table_variants.INFO,''),
5554                                                ';*{annotation_fields_new_name}=[^;]*',
5555                                                ''
5556                                                )
5557                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5558                                """
5559                                annotation_fields_new_name_info_msg = " [update]"
5560                                query_dict_remove[
5561                                    f"remove 'INFO/{annotation_fields_new_name}'"
5562                                ] = query
5563
5564                            # Sep between fields in INFO
5565                            nb_annotation_field += 1
5566                            if nb_annotation_field > 1:
5567                                annotation_field_sep = ";"
5568                            else:
5569                                annotation_field_sep = ""
5570
5571                            log.info(
5572                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5573                            )
5574
5575                            # Add INFO field to header
5576                            parquet_hdr_vcf_header_infos_number = (
5577                                parquet_hdr_vcf_header_infos[annotation_field].num
5578                                or "."
5579                            )
5580                            parquet_hdr_vcf_header_infos_type = (
5581                                parquet_hdr_vcf_header_infos[annotation_field].type
5582                                or "String"
5583                            )
5584                            parquet_hdr_vcf_header_infos_description = (
5585                                parquet_hdr_vcf_header_infos[annotation_field].desc
5586                                or f"{annotation_field} description"
5587                            )
5588                            parquet_hdr_vcf_header_infos_source = (
5589                                parquet_hdr_vcf_header_infos[annotation_field].source
5590                                or "unknown"
5591                            )
5592                            parquet_hdr_vcf_header_infos_version = (
5593                                parquet_hdr_vcf_header_infos[annotation_field].version
5594                                or "unknown"
5595                            )
5596
5597                            vcf_reader.infos[annotation_fields_new_name] = (
5598                                vcf.parser._Info(
5599                                    annotation_fields_new_name,
5600                                    parquet_hdr_vcf_header_infos_number,
5601                                    parquet_hdr_vcf_header_infos_type,
5602                                    parquet_hdr_vcf_header_infos_description,
5603                                    parquet_hdr_vcf_header_infos_source,
5604                                    parquet_hdr_vcf_header_infos_version,
5605                                    self.code_type_map[
5606                                        parquet_hdr_vcf_header_infos_type
5607                                    ],
5608                                )
5609                            )
5610
5611                            # Append
5612                            if force_append_annotation:
5613                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5614                            else:
5615                                query_case_when_append = ""
5616
5617                            # Annotation/Update query fields
5618                            # Found in INFO column
5619                            if (
5620                                annotation_field_column == "INFO"
5621                                and "INFO" in parquet_hdr_vcf_header_columns
5622                            ):
5623                                sql_query_annotation_update_info_sets.append(
5624                                    f"""
5625                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5626                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5627                                        ELSE ''
5628                                    END
5629                                """
5630                                )
5631                            # Found in a specific column
5632                            else:
5633                                sql_query_annotation_update_info_sets.append(
5634                                    f"""
5635                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5636                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5637                                        ELSE ''
5638                                    END
5639                                """
5640                                )
5641                                sql_query_annotation_to_agregate.append(
5642                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5643                                )
5644
5645                        # Not to annotate
5646                        else:
5647
5648                            if force_update_annotation:
5649                                annotation_message = "forced"
5650                            else:
5651                                annotation_message = "skipped"
5652
5653                            if annotation_field not in parquet_hdr_vcf_header_infos:
5654                                log.warning(
5655                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5656                                )
5657                            if annotation_fields_new_name in self.get_header().infos:
5658                                log.warning(
5659                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5660                                )
5661
5662                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5663                    # allow_annotation_full_info = True
5664                    allow_annotation_full_info = not force_append_annotation
5665
5666                    if parquet_type in ["regions"]:
5667                        allow_annotation_full_info = False
5668
5669                    if (
5670                        allow_annotation_full_info
5671                        and nb_annotation_field == len(annotation_fields)
5672                        and annotation_fields_all
5673                        and (
5674                            "INFO" in parquet_hdr_vcf_header_columns
5675                            and "INFO" in database.get_extra_columns()
5676                        )
5677                    ):
5678                        log.debug("Column INFO annotation enabled")
5679                        sql_query_annotation_update_info_sets = []
5680                        sql_query_annotation_update_info_sets.append(
5681                            f" table_parquet.INFO "
5682                        )
5683
5684                    if sql_query_annotation_update_info_sets:
5685
5686                        # Annotate
5687                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5688
5689                        # Join query annotation update info sets for SQL
5690                        sql_query_annotation_update_info_sets_sql = ",".join(
5691                            sql_query_annotation_update_info_sets
5692                        )
5693
5694                        # Check chromosomes list (and variants infos)
5695                        sql_query_chromosomes = f"""
5696                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5697                            FROM {table_variants} as table_variants
5698                            GROUP BY table_variants."#CHROM"
5699                            ORDER BY table_variants."#CHROM"
5700                            """
5701                        sql_query_chromosomes_df = self.conn.execute(
5702                            sql_query_chromosomes
5703                        ).df()
5704                        sql_query_chromosomes_dict = {
5705                            entry["CHROM"]: {
5706                                "count": entry["count_variants"],
5707                                "min": entry["min_variants"],
5708                                "max": entry["max_variants"],
5709                            }
5710                            for index, entry in sql_query_chromosomes_df.iterrows()
5711                        }
5712
5713                        # Init
5714                        nb_of_query = 0
5715                        nb_of_variant_annotated = 0
5716                        query_dict = query_dict_remove
5717
5718                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5719                        for chrom in sql_query_chromosomes_dict:
5720
5721                            # Number of variant by chromosome
5722                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5723                                chrom, {}
5724                            ).get("count", 0)
5725
5726                            log.debug(
5727                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5728                            )
5729
5730                            # Annotation with regions database
5731                            if parquet_type in ["regions"]:
5732                                sql_query_annotation_from_clause = f"""
5733                                    FROM (
5734                                        SELECT 
5735                                            '{chrom}' AS \"#CHROM\",
5736                                            table_variants_from.\"POS\" AS \"POS\",
5737                                            {",".join(sql_query_annotation_to_agregate)}
5738                                        FROM {table_variants} as table_variants_from
5739                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5740                                            table_parquet_from."#CHROM" = '{chrom}'
5741                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5742                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5743                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5744                                                )
5745                                        )
5746                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5747                                        GROUP BY table_variants_from.\"POS\"
5748                                        )
5749                                        as table_parquet
5750                                """
5751
5752                                sql_query_annotation_where_clause = """
5753                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5754                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5755                                """
5756
5757                            # Annotation with variants database
5758                            else:
5759                                sql_query_annotation_from_clause = f"""
5760                                    FROM {parquet_file_link} as table_parquet
5761                                """
5762                                sql_query_annotation_where_clause = f"""
5763                                    table_variants."#CHROM" = '{chrom}'
5764                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5765                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5766                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5767                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5768                                """
5769
5770                            # Create update query
5771                            sql_query_annotation_chrom_interval_pos = f"""
5772                                UPDATE {table_variants} as table_variants
5773                                    SET INFO = 
5774                                        concat(
5775                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5776                                                THEN table_variants.INFO
5777                                                ELSE ''
5778                                            END
5779                                            ,
5780                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5781                                                        AND (
5782                                                        concat({sql_query_annotation_update_info_sets_sql})
5783                                                        )
5784                                                        NOT IN ('','.') 
5785                                                    THEN ';'
5786                                                    ELSE ''
5787                                            END
5788                                            ,
5789                                            {sql_query_annotation_update_info_sets_sql}
5790                                            )
5791                                    {sql_query_annotation_from_clause}
5792                                    WHERE {sql_query_annotation_where_clause}
5793                                    ;
5794                                """
5795
5796                            # Add update query to dict
5797                            query_dict[
5798                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5799                            ] = sql_query_annotation_chrom_interval_pos
5800
5801                        nb_of_query = len(query_dict)
5802                        num_query = 0
5803
5804                        # SET max_expression_depth TO x
5805                        self.conn.execute("SET max_expression_depth TO 10000")
5806
5807                        for query_name in query_dict:
5808                            query = query_dict[query_name]
5809                            num_query += 1
5810                            log.info(
5811                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5812                            )
5813                            result = self.conn.execute(query)
5814                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5815                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5816                            log.info(
5817                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5818                            )
5819
5820                        log.info(
5821                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5822                        )
5823
5824                    else:
5825
5826                        log.info(
5827                            f"Annotation '{annotation_name}' - No Annotations available"
5828                        )
5829
5830                    log.debug("Final header: " + str(vcf_reader.infos))
5831
5832        # Remove added columns
5833        for added_column in added_columns:
5834            self.drop_column(column=added_column)
5835
5836    def annotation_splice(self, threads: int = None) -> None:
5837        """
5838        This function annotate with snpEff
5839
5840        :param threads: The number of threads to use
5841        :return: the value of the variable "return_value".
5842        """
5843
5844        # DEBUG
5845        log.debug("Start annotation with splice tools")
5846
5847        # Threads
5848        if not threads:
5849            threads = self.get_threads()
5850        log.debug("Threads: " + str(threads))
5851
5852        # DEBUG
5853        delete_tmp = True
5854        if self.get_config().get("verbosity", "warning") in ["debug"]:
5855            delete_tmp = False
5856            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5857
5858        # Config
5859        config = self.get_config()
5860        log.debug("Config: " + str(config))
5861        splice_config = config.get("tools", {}).get("splice", {})
5862        if not splice_config:
5863            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5864        if not splice_config:
5865            msg_err = "No Splice tool config"
5866            log.error(msg_err)
5867            raise ValueError(msg_err)
5868        log.debug(f"splice_config={splice_config}")
5869
5870        # Config - Folders - Databases
5871        databases_folders = (
5872            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5873        )
5874        log.debug("Databases annotations: " + str(databases_folders))
5875
5876        # Splice docker image
5877        splice_docker_image = splice_config.get("docker").get("image")
5878
5879        # Pull splice image if it's not already there
5880        if not check_docker_image_exists(splice_docker_image):
5881            log.warning(
5882                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5883            )
5884            try:
5885                command(f"docker pull {splice_config.get('docker').get('image')}")
5886            except subprocess.CalledProcessError:
5887                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5888                log.error(msg_err)
5889                raise ValueError(msg_err)
5890                return None
5891
5892        # Config - splice databases
5893        splice_databases = (
5894            config.get("folders", {})
5895            .get("databases", {})
5896            .get("splice", DEFAULT_SPLICE_FOLDER)
5897        )
5898        splice_databases = full_path(splice_databases)
5899
5900        # Param
5901        param = self.get_param()
5902        log.debug("Param: " + str(param))
5903
5904        # Param
5905        options = param.get("annotation", {}).get("splice", {})
5906        log.debug("Options: " + str(options))
5907
5908        # Data
5909        table_variants = self.get_table_variants()
5910
5911        # Check if not empty
5912        log.debug("Check if not empty")
5913        sql_query_chromosomes = (
5914            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5915        )
5916        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5917            log.info("VCF empty")
5918            return None
5919
5920        # Export in VCF
5921        log.debug("Create initial file to annotate")
5922
5923        # Create output folder
5924        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5925        if not os.path.exists(output_folder):
5926            Path(output_folder).mkdir(parents=True, exist_ok=True)
5927
5928        # Create tmp VCF file
5929        tmp_vcf = NamedTemporaryFile(
5930            prefix=self.get_prefix(),
5931            dir=output_folder,
5932            suffix=".vcf",
5933            delete=False,
5934        )
5935        tmp_vcf_name = tmp_vcf.name
5936
5937        # VCF header
5938        header = self.get_header()
5939
5940        # Existing annotations
5941        for vcf_annotation in self.get_header().infos:
5942
5943            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5944            log.debug(
5945                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5946            )
5947
5948        # Memory limit
5949        if config.get("memory", None):
5950            memory_limit = config.get("memory", "8G").upper()
5951            # upper()
5952        else:
5953            memory_limit = "8G"
5954        log.debug(f"memory_limit: {memory_limit}")
5955
5956        # Export VCF file
5957        self.export_variant_vcf(
5958            vcf_file=tmp_vcf_name,
5959            remove_info=True,
5960            add_samples=True,
5961            index=False,
5962        )
5963
5964        # Create docker container and launch splice analysis
5965        if splice_config:
5966
5967            # Splice mount folders
5968            mount_folders = splice_config.get("mount", {})
5969
5970            # Genome mount
5971            mount_folders[
5972                config.get("folders", {})
5973                .get("databases", {})
5974                .get("genomes", DEFAULT_GENOME_FOLDER)
5975            ] = "ro"
5976
5977            # SpliceAI mount
5978            mount_folders[
5979                config.get("folders", {})
5980                .get("databases", {})
5981                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
5982            ] = "ro"
5983
5984            # Genome mount
5985            mount_folders[
5986                config.get("folders", {})
5987                .get("databases", {})
5988                .get("spip", DEFAULT_SPIP_FOLDER)
5989            ] = "ro"
5990
5991            # Mount folders
5992            mount = []
5993
5994            # Config mount
5995            mount = [
5996                f"-v {full_path(path)}:{full_path(path)}:{mode}"
5997                for path, mode in mount_folders.items()
5998            ]
5999
6000            if any(value for value in splice_config.values() if value is None):
6001                log.warning("At least one splice config parameter is empty")
6002                return None
6003
6004            # Params in splice nf
6005            def check_values(dico: dict):
6006                """
6007                Ensure parameters for NF splice pipeline
6008                """
6009                for key, val in dico.items():
6010                    if key == "genome":
6011                        if any(
6012                            assemb in options.get("genome", {})
6013                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6014                        ):
6015                            yield f"--{key} hg19"
6016                        elif any(
6017                            assemb in options.get("genome", {})
6018                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6019                        ):
6020                            yield f"--{key} hg38"
6021                    elif (
6022                        (isinstance(val, str) and val)
6023                        or isinstance(val, int)
6024                        or isinstance(val, bool)
6025                    ):
6026                        yield f"--{key} {val}"
6027
6028            # Genome
6029            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6030            options["genome"] = genome
6031
6032            # NF params
6033            nf_params = []
6034
6035            # Add options
6036            if options:
6037                nf_params = list(check_values(options))
6038                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6039            else:
6040                log.debug("No NF params provided")
6041
6042            # Add threads
6043            if "threads" not in options.keys():
6044                nf_params.append(f"--threads {threads}")
6045
6046            # Genome path
6047            genome_path = find_genome(
6048                config.get("folders", {})
6049                .get("databases", {})
6050                .get("genomes", DEFAULT_GENOME_FOLDER),
6051                file=f"{genome}.fa",
6052            )
6053            # Add genome path
6054            if not genome_path:
6055                raise ValueError(
6056                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6057                )
6058            else:
6059                log.debug(f"Genome: {genome_path}")
6060                nf_params.append(f"--genome_path {genome_path}")
6061
6062            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6063                """
6064                Setting up updated databases for SPiP and SpliceAI
6065                """
6066
6067                try:
6068
6069                    # SpliceAI assembly transcriptome
6070                    spliceai_assembly = os.path.join(
6071                        config.get("folders", {})
6072                        .get("databases", {})
6073                        .get("spliceai", {}),
6074                        options.get("genome"),
6075                        "transcriptome",
6076                    )
6077                    spip_assembly = options.get("genome")
6078
6079                    spip = find(
6080                        f"transcriptome_{spip_assembly}.RData",
6081                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6082                    )
6083                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6084                    log.debug(f"SPiP annotations: {spip}")
6085                    log.debug(f"SpliceAI annotations: {spliceai}")
6086                    if spip and spliceai:
6087                        return [
6088                            f"--spip_transcriptome {spip}",
6089                            f"--spliceai_annotations {spliceai}",
6090                        ]
6091                    else:
6092                        # TODO crash and go on with basic annotations ?
6093                        # raise ValueError(
6094                        #     "Can't find splice databases in configuration EXIT"
6095                        # )
6096                        log.warning(
6097                            "Can't find splice databases in configuration, use annotations file from image"
6098                        )
6099                except TypeError:
6100                    log.warning(
6101                        "Can't find splice databases in configuration, use annotations file from image"
6102                    )
6103                    return []
6104
6105            # Add options, check if transcriptome option have already beend provided
6106            if (
6107                "spip_transcriptome" not in nf_params
6108                and "spliceai_transcriptome" not in nf_params
6109            ):
6110                splice_reference = splice_annotations(options, config)
6111                if splice_reference:
6112                    nf_params.extend(splice_reference)
6113
6114            nf_params.append(f"--output_folder {output_folder}")
6115
6116            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6117            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6118            log.debug(cmd)
6119
6120            splice_config["docker"]["command"] = cmd
6121
6122            docker_cmd = get_bin_command(
6123                tool="splice",
6124                bin_type="docker",
6125                config=config,
6126                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6127                add_options=f"--name {random_uuid} {' '.join(mount)}",
6128            )
6129
6130            # Docker debug
6131            # if splice_config.get("rm_container"):
6132            #     rm_container = "--rm"
6133            # else:
6134            #     rm_container = ""
6135            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6136
6137            log.debug(docker_cmd)
6138            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6139            log.debug(res.stdout)
6140            if res.stderr:
6141                log.error(res.stderr)
6142            res.check_returncode()
6143        else:
6144            log.warning(f"Splice tool configuration not found: {config}")
6145
6146        # Update variants
6147        log.info("Annotation - Updating...")
6148        # Test find output vcf
6149        log.debug(
6150            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6151        )
6152        output_vcf = []
6153        # Wrong folder to look in
6154        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6155            if (
6156                files
6157                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6158            ):
6159                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6160        # log.debug(os.listdir(options.get("output_folder")))
6161        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6162        if not output_vcf:
6163            log.debug(
6164                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6165            )
6166        else:
6167            # Get new header from annotated vcf
6168            log.debug(f"Initial header: {len(header.infos)} fields")
6169            # Create new header with splice infos
6170            new_vcf = Variants(input=output_vcf[0])
6171            new_vcf_header = new_vcf.get_header().infos
6172            for keys, infos in new_vcf_header.items():
6173                if keys not in header.infos.keys():
6174                    header.infos[keys] = infos
6175            log.debug(f"New header: {len(header.infos)} fields")
6176            log.debug(f"Splice tmp output: {output_vcf[0]}")
6177            self.update_from_vcf(output_vcf[0])
6178
6179        # Remove folder
6180        remove_if_exists(output_folder)
6181
6182    ###
6183    # Prioritization
6184    ###
6185
6186    def get_config_default(self, name: str) -> dict:
6187        """
6188        The function `get_config_default` returns a dictionary containing default configurations for
6189        various calculations and prioritizations.
6190
6191        :param name: The `get_config_default` function returns a dictionary containing default
6192        configurations for different calculations and prioritizations. The `name` parameter is used to
6193        specify which specific configuration to retrieve from the dictionary
6194        :type name: str
6195        :return: The function `get_config_default` returns a dictionary containing default configuration
6196        settings for different calculations and prioritizations. The specific configuration settings are
6197        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6198        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6199        returned. If there is no match, an empty dictionary is returned.
6200        """
6201
6202        config_default = {
6203            "calculations": {
6204                "variant_chr_pos_alt_ref": {
6205                    "type": "sql",
6206                    "name": "variant_chr_pos_alt_ref",
6207                    "description": "Create a variant ID with chromosome, position, alt and ref",
6208                    "available": False,
6209                    "output_column_name": "variant_chr_pos_alt_ref",
6210                    "output_column_type": "String",
6211                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6212                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6213                    "operation_info": True,
6214                },
6215                "VARTYPE": {
6216                    "type": "sql",
6217                    "name": "VARTYPE",
6218                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6219                    "available": True,
6220                    "output_column_name": "VARTYPE",
6221                    "output_column_type": "String",
6222                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6223                    "operation_query": """
6224                            CASE
6225                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6226                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6227                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6228                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6229                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6230                                ELSE 'UNDEFINED'
6231                            END
6232                            """,
6233                    "info_fields": ["SVTYPE"],
6234                    "operation_info": True,
6235                },
6236                "snpeff_hgvs": {
6237                    "type": "python",
6238                    "name": "snpeff_hgvs",
6239                    "description": "HGVS nomenclatures from snpEff annotation",
6240                    "available": True,
6241                    "function_name": "calculation_extract_snpeff_hgvs",
6242                    "function_params": [],
6243                },
6244                "NOMEN": {
6245                    "type": "python",
6246                    "name": "NOMEN",
6247                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6248                    "available": True,
6249                    "function_name": "calculation_extract_nomen",
6250                    "function_params": [],
6251                },
6252                "FINDBYPIPELINE": {
6253                    "type": "python",
6254                    "name": "FINDBYPIPELINE",
6255                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6256                    "available": True,
6257                    "function_name": "calculation_find_by_pipeline",
6258                    "function_params": ["findbypipeline"],
6259                },
6260                "FINDBYSAMPLE": {
6261                    "type": "python",
6262                    "name": "FINDBYSAMPLE",
6263                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6264                    "available": True,
6265                    "function_name": "calculation_find_by_pipeline",
6266                    "function_params": ["findbysample"],
6267                },
6268                "GENOTYPECONCORDANCE": {
6269                    "type": "python",
6270                    "name": "GENOTYPECONCORDANCE",
6271                    "description": "Concordance of genotype for multi caller VCF",
6272                    "available": True,
6273                    "function_name": "calculation_genotype_concordance",
6274                    "function_params": [],
6275                },
6276                "BARCODE": {
6277                    "type": "python",
6278                    "name": "BARCODE",
6279                    "description": "BARCODE as VaRank tool",
6280                    "available": True,
6281                    "function_name": "calculation_barcode",
6282                    "function_params": [],
6283                },
6284                "BARCODEFAMILY": {
6285                    "type": "python",
6286                    "name": "BARCODEFAMILY",
6287                    "description": "BARCODEFAMILY as VaRank tool",
6288                    "available": True,
6289                    "function_name": "calculation_barcode_family",
6290                    "function_params": ["BCF"],
6291                },
6292                "TRIO": {
6293                    "type": "python",
6294                    "name": "TRIO",
6295                    "description": "Inheritance for a trio family",
6296                    "available": True,
6297                    "function_name": "calculation_trio",
6298                    "function_params": [],
6299                },
6300                "VAF": {
6301                    "type": "python",
6302                    "name": "VAF",
6303                    "description": "Variant Allele Frequency (VAF) harmonization",
6304                    "available": True,
6305                    "function_name": "calculation_vaf_normalization",
6306                    "function_params": [],
6307                },
6308                "VAF_stats": {
6309                    "type": "python",
6310                    "name": "VAF_stats",
6311                    "description": "Variant Allele Frequency (VAF) statistics",
6312                    "available": True,
6313                    "function_name": "calculation_genotype_stats",
6314                    "function_params": ["VAF"],
6315                },
6316                "DP_stats": {
6317                    "type": "python",
6318                    "name": "DP_stats",
6319                    "description": "Depth (DP) statistics",
6320                    "available": True,
6321                    "function_name": "calculation_genotype_stats",
6322                    "function_params": ["DP"],
6323                },
6324                "variant_id": {
6325                    "type": "python",
6326                    "name": "variant_id",
6327                    "description": "Variant ID generated from variant position and type",
6328                    "available": True,
6329                    "function_name": "calculation_variant_id",
6330                    "function_params": [],
6331                },
6332            },
6333            "prioritizations": {
6334                "default": {
6335                    "filter": [
6336                        {
6337                            "type": "notequals",
6338                            "value": "!PASS|\\.",
6339                            "score": 0,
6340                            "flag": "FILTERED",
6341                            "comment": ["Bad variant quality"],
6342                        },
6343                        {
6344                            "type": "equals",
6345                            "value": "REJECT",
6346                            "score": -20,
6347                            "flag": "PASS",
6348                            "comment": ["Bad variant quality"],
6349                        },
6350                    ],
6351                    "DP": [
6352                        {
6353                            "type": "gte",
6354                            "value": "50",
6355                            "score": 5,
6356                            "flag": "PASS",
6357                            "comment": ["DP higher than 50"],
6358                        }
6359                    ],
6360                    "ANN": [
6361                        {
6362                            "type": "contains",
6363                            "value": "HIGH",
6364                            "score": 5,
6365                            "flag": "PASS",
6366                            "comment": [
6367                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6368                            ],
6369                        },
6370                        {
6371                            "type": "contains",
6372                            "value": "MODERATE",
6373                            "score": 3,
6374                            "flag": "PASS",
6375                            "comment": [
6376                                "A non-disruptive variant that might change protein effectiveness"
6377                            ],
6378                        },
6379                        {
6380                            "type": "contains",
6381                            "value": "LOW",
6382                            "score": 0,
6383                            "flag": "FILTERED",
6384                            "comment": [
6385                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6386                            ],
6387                        },
6388                        {
6389                            "type": "contains",
6390                            "value": "MODIFIER",
6391                            "score": 0,
6392                            "flag": "FILTERED",
6393                            "comment": [
6394                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6395                            ],
6396                        },
6397                    ],
6398                }
6399            },
6400        }
6401
6402        return config_default.get(name, None)
6403
6404    def get_config_json(
6405        self, name: str, config_dict: dict = {}, config_file: str = None
6406    ) -> dict:
6407        """
6408        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6409        default values, a dictionary, and a file.
6410
6411        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6412        the name of the configuration. It is used to identify and retrieve the configuration settings
6413        for a specific component or module
6414        :type name: str
6415        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6416        dictionary that allows you to provide additional configuration settings or overrides. When you
6417        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6418        the key is the configuration setting you want to override or
6419        :type config_dict: dict
6420        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6421        specify the path to a configuration file that contains additional settings. If provided, the
6422        function will read the contents of this file and update the configuration dictionary with the
6423        values found in the file, overriding any existing values with the
6424        :type config_file: str
6425        :return: The function `get_config_json` returns a dictionary containing the configuration
6426        settings.
6427        """
6428
6429        # Create with default prioritizations
6430        config_default = self.get_config_default(name=name)
6431        configuration = config_default
6432        # log.debug(f"configuration={configuration}")
6433
6434        # Replace prioritizations from dict
6435        for config in config_dict:
6436            configuration[config] = config_dict[config]
6437
6438        # Replace prioritizations from file
6439        config_file = full_path(config_file)
6440        if config_file:
6441            if os.path.exists(config_file):
6442                with open(config_file) as config_file_content:
6443                    config_file_dict = json.load(config_file_content)
6444                for config in config_file_dict:
6445                    configuration[config] = config_file_dict[config]
6446            else:
6447                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6448                log.error(msg_error)
6449                raise ValueError(msg_error)
6450
6451        return configuration
6452
6453    # def get_prioritizations_config(self, prioritizations_config_dict:dict = {}, prioritizations_config_file:str = None) -> dict:
6454
6455    #     # Create with default prioritizations
6456    #     prioritizations_config = self.get_config_default("prioritization")
6457
6458    #     # Replace prioritizations from dict
6459    #     for prioritization_config in prioritizations_config_dict:
6460    #         prioritizations_config[prioritization_config] = prioritizations_config_dict[prioritization_config]
6461
6462    #     # Replace prioritizations from file
6463    #     prioritizations_config_file = full_path(prioritizations_config_file)
6464    #     if prioritizations_config_file:
6465    #         if os.path.exists(prioritizations_config_file):
6466    #             with open(prioritizations_config_file) as prioritizations_config_file_content:
6467    #                 prioritizations_config_file_dict = json.load(prioritizations_config_file_content)
6468    #             for prioritization_config in prioritizations_config_file_dict:
6469    #                 prioritizations_config[prioritization_config] = prioritizations_config_file_dict[prioritization_config]
6470    #         else:
6471    #             log.error(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist")
6472    #             raise ValueError(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist")
6473
6474    #     return prioritizations_config
6475
6476    def prioritization(self) -> None:
6477        """
6478        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6479        INFO fields
6480        """
6481
6482        # Config
6483        config = self.get_config()
6484
6485        # Param
6486        param = self.get_param()
6487
6488        # Quick Prioritizations
6489        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6490
6491        # Configuration profiles
6492        prioritization_config_file = param.get("prioritization", {}).get(
6493            "prioritization_config", None
6494        )
6495        prioritization_config_file = full_path(prioritization_config_file)
6496        prioritizations_config = self.get_config_json(
6497            name="prioritizations", config_file=prioritization_config_file
6498        )
6499
6500        # Prioritization options
6501        profiles = param.get("prioritization", {}).get("profiles", [])
6502        if isinstance(profiles, str):
6503            profiles = profiles.split(",")
6504        pzfields = param.get("prioritization", {}).get(
6505            "pzfields", ["PZFlag", "PZScore"]
6506        )
6507        if isinstance(pzfields, str):
6508            pzfields = pzfields.split(",")
6509        default_profile = param.get("prioritization", {}).get("default_profile", None)
6510        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6511        prioritization_score_mode = param.get("prioritization", {}).get(
6512            "prioritization_score_mode", "HOWARD"
6513        )
6514
6515        # Quick Prioritizations
6516        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6517        prioritizations = param.get("prioritizations", None)
6518        if prioritizations:
6519            log.info("Quick Prioritization:")
6520            for profile in prioritizations.split(","):
6521                if profile not in profiles:
6522                    profiles.append(profile)
6523                    log.info(f"   {profile}")
6524
6525        # If profile "ALL" provided, all profiles in the config profiles
6526        if "ALL" in profiles:
6527            profiles = list(prioritizations_config.keys())
6528
6529        for profile in profiles:
6530            if prioritizations_config.get(profile, None):
6531                log.debug(f"Profile '{profile}' configured")
6532            else:
6533                msg_error = f"Profile '{profile}' NOT configured"
6534                log.error(msg_error)
6535                raise ValueError(msg_error)
6536
6537        if profiles:
6538            log.info(f"Prioritization... ")
6539        else:
6540            log.debug(f"No profile defined")
6541            return
6542
6543        if not default_profile and len(profiles):
6544            default_profile = profiles[0]
6545
6546        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6547        log.debug("Profiles to check: " + str(list(profiles)))
6548
6549        # Variables
6550        table_variants = self.get_table_variants(clause="update")
6551
6552        # Added columns
6553        added_columns = []
6554
6555        # Create list of PZfields
6556        # List of PZFields
6557        list_of_pzfields_original = pzfields + [
6558            pzfield + pzfields_sep + profile
6559            for pzfield in pzfields
6560            for profile in profiles
6561        ]
6562        list_of_pzfields = []
6563        log.debug(f"{list_of_pzfields_original}")
6564
6565        # Remove existing PZfields to use if exists
6566        for pzfield in list_of_pzfields_original:
6567            if self.get_header().infos.get(pzfield, None) is None:
6568                list_of_pzfields.append(pzfield)
6569                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6570            else:
6571                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6572
6573        if list_of_pzfields:
6574
6575            # Explode Infos fields
6576            explode_infos_prefix = self.get_explode_infos_prefix()
6577            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6578            extra_infos = self.get_extra_infos()
6579
6580            # PZfields tags description
6581            PZfields_INFOS = {
6582                "PZTags": {
6583                    "ID": "PZTags",
6584                    "Number": ".",
6585                    "Type": "String",
6586                    "Description": "Variant tags based on annotation criteria",
6587                },
6588                "PZScore": {
6589                    "ID": "PZScore",
6590                    "Number": 1,
6591                    "Type": "Integer",
6592                    "Description": "Variant score based on annotation criteria",
6593                },
6594                "PZFlag": {
6595                    "ID": "PZFlag",
6596                    "Number": 1,
6597                    "Type": "String",
6598                    "Description": "Variant flag based on annotation criteria",
6599                },
6600                "PZComment": {
6601                    "ID": "PZComment",
6602                    "Number": ".",
6603                    "Type": "String",
6604                    "Description": "Variant comment based on annotation criteria",
6605                },
6606                "PZInfos": {
6607                    "ID": "PZInfos",
6608                    "Number": ".",
6609                    "Type": "String",
6610                    "Description": "Variant infos based on annotation criteria",
6611                },
6612            }
6613
6614            # Create INFO fields if not exist
6615            for field in PZfields_INFOS:
6616                field_ID = PZfields_INFOS[field]["ID"]
6617                field_description = PZfields_INFOS[field]["Description"]
6618                if field_ID not in self.get_header().infos and field_ID in pzfields:
6619                    field_description = (
6620                        PZfields_INFOS[field]["Description"]
6621                        + f", profile {default_profile}"
6622                    )
6623                    self.get_header().infos[field_ID] = vcf.parser._Info(
6624                        field_ID,
6625                        PZfields_INFOS[field]["Number"],
6626                        PZfields_INFOS[field]["Type"],
6627                        field_description,
6628                        "unknown",
6629                        "unknown",
6630                        code_type_map[PZfields_INFOS[field]["Type"]],
6631                    )
6632
6633            # Create INFO fields if not exist for each profile
6634            for profile in prioritizations_config:
6635                if profile in profiles or profiles == []:
6636                    for field in PZfields_INFOS:
6637                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6638                        field_description = (
6639                            PZfields_INFOS[field]["Description"]
6640                            + f", profile {profile}"
6641                        )
6642                        if (
6643                            field_ID not in self.get_header().infos
6644                            and field in pzfields
6645                        ):
6646                            self.get_header().infos[field_ID] = vcf.parser._Info(
6647                                field_ID,
6648                                PZfields_INFOS[field]["Number"],
6649                                PZfields_INFOS[field]["Type"],
6650                                field_description,
6651                                "unknown",
6652                                "unknown",
6653                                code_type_map[PZfields_INFOS[field]["Type"]],
6654                            )
6655
6656            # Header
6657            for pzfield in list_of_pzfields:
6658                if re.match("PZScore.*", pzfield):
6659                    added_column = self.add_column(
6660                        table_name=table_variants,
6661                        column_name=pzfield,
6662                        column_type="INTEGER",
6663                        default_value="0",
6664                    )
6665                elif re.match("PZFlag.*", pzfield):
6666                    added_column = self.add_column(
6667                        table_name=table_variants,
6668                        column_name=pzfield,
6669                        column_type="BOOLEAN",
6670                        default_value="1",
6671                    )
6672                else:
6673                    added_column = self.add_column(
6674                        table_name=table_variants,
6675                        column_name=pzfield,
6676                        column_type="STRING",
6677                        default_value="''",
6678                    )
6679                added_columns.append(added_column)
6680
6681            # Profiles
6682            if profiles:
6683
6684                # foreach profile in configuration file
6685                for profile in prioritizations_config:
6686
6687                    # If profile is asked in param, or ALL are asked (empty profile [])
6688                    if profile in profiles or profiles == []:
6689                        log.info(f"Profile '{profile}'")
6690
6691                        sql_set_info_option = ""
6692
6693                        sql_set_info = []
6694
6695                        # PZ fields set
6696
6697                        # PZScore
6698                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6699                            sql_set_info.append(
6700                                f"""
6701                                    concat(
6702                                        'PZScore{pzfields_sep}{profile}=',
6703                                        PZScore{pzfields_sep}{profile}
6704                                    ) 
6705                                """
6706                            )
6707                            if (
6708                                profile == default_profile
6709                                and "PZScore" in list_of_pzfields
6710                            ):
6711                                sql_set_info.append(
6712                                    f"""
6713                                        concat(
6714                                            'PZScore=',
6715                                            PZScore{pzfields_sep}{profile}
6716                                        )
6717                                    """
6718                                )
6719
6720                        # PZFlag
6721                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6722                            sql_set_info.append(
6723                                f"""
6724                                    concat(
6725                                        'PZFlag{pzfields_sep}{profile}=',
6726                                        CASE 
6727                                            WHEN PZFlag{pzfields_sep}{profile}==1
6728                                            THEN 'PASS'
6729                                            WHEN PZFlag{pzfields_sep}{profile}==0
6730                                            THEN 'FILTERED'
6731                                        END
6732                                    ) 
6733                                """
6734                            )
6735                            if (
6736                                profile == default_profile
6737                                and "PZFlag" in list_of_pzfields
6738                            ):
6739                                sql_set_info.append(
6740                                    f"""
6741                                        concat(
6742                                            'PZFlag=',
6743                                            CASE 
6744                                                WHEN PZFlag{pzfields_sep}{profile}==1
6745                                                THEN 'PASS'
6746                                                WHEN PZFlag{pzfields_sep}{profile}==0
6747                                                THEN 'FILTERED'
6748                                            END
6749                                        )
6750                                    """
6751                                )
6752
6753                        # PZComment
6754                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6755                            sql_set_info.append(
6756                                f"""
6757                                    CASE
6758                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6759                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6760                                        ELSE ''
6761                                    END
6762                                """
6763                            )
6764                            if (
6765                                profile == default_profile
6766                                and "PZComment" in list_of_pzfields
6767                            ):
6768                                sql_set_info.append(
6769                                    f"""
6770                                        CASE
6771                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6772                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6773                                            ELSE ''
6774                                        END
6775                                    """
6776                                )
6777
6778                        # PZInfos
6779                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6780                            sql_set_info.append(
6781                                f"""
6782                                    CASE
6783                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6784                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6785                                        ELSE ''
6786                                    END
6787                                """
6788                            )
6789                            if (
6790                                profile == default_profile
6791                                and "PZInfos" in list_of_pzfields
6792                            ):
6793                                sql_set_info.append(
6794                                    f"""
6795                                        CASE
6796                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6797                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6798                                            ELSE ''
6799                                        END
6800                                    """
6801                                )
6802
6803                        # Merge PZfields
6804                        sql_set_info_option = ""
6805                        sql_set_sep = ""
6806                        for sql_set in sql_set_info:
6807                            if sql_set_sep:
6808                                sql_set_info_option += f"""
6809                                    , concat('{sql_set_sep}', {sql_set})
6810                                """
6811                            else:
6812                                sql_set_info_option += f"""
6813                                    , {sql_set}
6814                                """
6815                            sql_set_sep = ";"
6816
6817                        sql_queries = []
6818                        for annotation in prioritizations_config[profile]:
6819
6820                            # Check if annotation field is present
6821                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6822                                log.debug(f"Annotation '{annotation}' not in data")
6823                                continue
6824                            else:
6825                                log.debug(f"Annotation '{annotation}' in data")
6826
6827                            # For each criterions
6828                            for criterion in prioritizations_config[profile][
6829                                annotation
6830                            ]:
6831                                criterion_type = criterion["type"]
6832                                criterion_value = criterion["value"]
6833                                criterion_score = criterion.get("score", 0)
6834                                criterion_flag = criterion.get("flag", "PASS")
6835                                criterion_flag_bool = criterion_flag == "PASS"
6836                                criterion_comment = (
6837                                    ", ".join(criterion.get("comment", []))
6838                                    .replace("'", "''")
6839                                    .replace(";", ",")
6840                                    .replace("\t", " ")
6841                                )
6842                                criterion_infos = (
6843                                    str(criterion)
6844                                    .replace("'", "''")
6845                                    .replace(";", ",")
6846                                    .replace("\t", " ")
6847                                )
6848
6849                                sql_set = []
6850                                sql_set_info = []
6851
6852                                # PZ fields set
6853                                if (
6854                                    f"PZScore{pzfields_sep}{profile}"
6855                                    in list_of_pzfields
6856                                ):
6857                                    if prioritization_score_mode == "HOWARD":
6858                                        sql_set.append(
6859                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6860                                        )
6861                                    elif prioritization_score_mode == "VaRank":
6862                                        sql_set.append(
6863                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6864                                        )
6865                                    else:
6866                                        sql_set.append(
6867                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6868                                        )
6869                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6870                                    sql_set.append(
6871                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6872                                    )
6873                                if (
6874                                    f"PZComment{pzfields_sep}{profile}"
6875                                    in list_of_pzfields
6876                                ):
6877                                    sql_set.append(
6878                                        f"""
6879                                            PZComment{pzfields_sep}{profile} = 
6880                                                concat(
6881                                                    PZComment{pzfields_sep}{profile},
6882                                                    CASE 
6883                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6884                                                        THEN ', '
6885                                                        ELSE ''
6886                                                    END,
6887                                                    '{criterion_comment}'
6888                                                )
6889                                        """
6890                                    )
6891                                if (
6892                                    f"PZInfos{pzfields_sep}{profile}"
6893                                    in list_of_pzfields
6894                                ):
6895                                    sql_set.append(
6896                                        f"""
6897                                            PZInfos{pzfields_sep}{profile} = 
6898                                                concat(
6899                                                    PZInfos{pzfields_sep}{profile},
6900                                                    '{criterion_infos}'
6901                                                )
6902                                        """
6903                                    )
6904                                sql_set_option = ",".join(sql_set)
6905
6906                                # Criterion and comparison
6907                                try:
6908                                    float(criterion_value)
6909                                    sql_update = f"""
6910                                        UPDATE {table_variants}
6911                                        SET {sql_set_option}
6912                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
6913                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
6914                                        """
6915                                except:
6916                                    contains_option = ""
6917                                    if criterion_type == "contains":
6918                                        contains_option = ".*"
6919                                    sql_update = f"""
6920                                        UPDATE {table_variants}
6921                                        SET {sql_set_option}
6922                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
6923                                        """
6924                                sql_queries.append(sql_update)
6925
6926                        # PZTags
6927                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
6928
6929                            # Create PZFalgs value
6930                            pztags_value = ""
6931                            pztags_sep_default = "|"
6932                            pztags_sep = ""
6933                            for pzfield in pzfields:
6934                                if pzfield not in ["PZTags"]:
6935                                    if (
6936                                        f"{pzfield}{pzfields_sep}{profile}"
6937                                        in list_of_pzfields
6938                                    ):
6939                                        if pzfield in ["PZFlag"]:
6940                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
6941                                                CASE WHEN PZFlag{pzfields_sep}{profile}
6942                                                    THEN 'PASS'
6943                                                    ELSE 'FILTERED'
6944                                                END, '"""
6945                                        else:
6946                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
6947                                        pztags_sep = pztags_sep_default
6948
6949                            # Add Query update for PZFlags
6950                            sql_update_pztags = f"""
6951                                UPDATE {table_variants}
6952                                SET INFO = concat(
6953                                        INFO,
6954                                        CASE WHEN INFO NOT in ('','.')
6955                                                THEN ';'
6956                                                ELSE ''
6957                                        END,
6958                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
6959                                    )
6960                                """
6961                            sql_queries.append(sql_update_pztags)
6962
6963                            # Add Query update for PZFlags for default
6964                            if profile == default_profile:
6965                                sql_update_pztags_default = f"""
6966                                UPDATE {table_variants}
6967                                SET INFO = concat(
6968                                        INFO,
6969                                        ';',
6970                                        'PZTags={pztags_value}'
6971                                    )
6972                                """
6973                                sql_queries.append(sql_update_pztags_default)
6974
6975                        log.info(f"""Profile '{profile}' - Prioritization... """)
6976
6977                        if sql_queries:
6978
6979                            for sql_query in sql_queries:
6980                                log.debug(
6981                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
6982                                )
6983                                self.conn.execute(sql_query)
6984
6985                        log.info(f"""Profile '{profile}' - Update... """)
6986                        sql_query_update = f"""
6987                            UPDATE {table_variants}
6988                            SET INFO =  
6989                                concat(
6990                                    CASE
6991                                        WHEN INFO NOT IN ('','.')
6992                                        THEN concat(INFO, ';')
6993                                        ELSE ''
6994                                    END
6995                                    {sql_set_info_option}
6996                                )
6997                        """
6998                        self.conn.execute(sql_query_update)
6999
7000        else:
7001
7002            log.warning(f"No profiles in parameters")
7003
7004        # Remove added columns
7005        for added_column in added_columns:
7006            self.drop_column(column=added_column)
7007
7008        # Explode INFOS fields into table fields
7009        if self.get_explode_infos():
7010            self.explode_infos(
7011                prefix=self.get_explode_infos_prefix(),
7012                fields=self.get_explode_infos_fields(),
7013                force=True,
7014            )
7015
7016        return
7017
7018    ###
7019    # HGVS
7020    ###
7021
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Workflow (as implemented below): select SNV/InDel variants from the
        variants table, load refSeq (and optionally refSeqLink) rows into
        Polars dataframes, compute HGVS names per variant in parallel with a
        Dask dataframe, write results back through a temporary parquet file,
        append them to the INFO column, and register the 'hgvs' INFO header.

        NOTE(review): `pl` (polars) and `dd` (dask.dataframe) are not imported
        at the top of this file; they are assumed to come from the wildcard
        imports (howard.functions.*) — confirm.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closure note: reads `polars_conn`, `transcripts`, `genome` and the
            HGVS option flags from the enclosing scope at call time.

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE(review): `chr` shadows the builtin; kept as-is (doc-only edit)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): SQL is built via f-string interpolation of CHROM/POS
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set, emit a second, protein-level HGVS name
                # in addition to the default one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): this binding is superseded by the second
        # pl.SQLContext(...) created further below (after refseq_df exists);
        # the nested functions resolve `polars_conn` at call time, so this
        # first instance appears redundant — confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Genome folder (no default; used first by find_genome below)
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "opt1=val1,opt2" style options into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means True
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit genome path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT restricted to letters)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (dropped again at the end of this method)
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid clashes)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping a variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): if refseqlink_file is falsy, refseqlink_df is never
            # defined, and the protein lookup in annotation_hgvs_partition would
            # fail if use_protein/add_protein/full_format is set — confirm
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # (re-created here so refseq_df/refseqlink_df globals are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (join on CHROM/POS/REF/ALT)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' for annotated variants
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header: declare the 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
7404
7405    ###
7406    # Calculation
7407    ###
7408
7409    def get_operations_help(
7410        self, operations_config_dict: dict = {}, operations_config_file: str = None
7411    ) -> list:
7412
7413        # Init
7414        operations_help = []
7415
7416        # operations
7417        operations = self.get_config_json(
7418            name="calculations",
7419            config_dict=operations_config_dict,
7420            config_file=operations_config_file,
7421        )
7422        for op in operations:
7423            op_name = operations[op].get("name", op).upper()
7424            op_description = operations[op].get("description", op_name)
7425            op_available = operations[op].get("available", False)
7426            if op_available:
7427                operations_help.append(f"   {op_name}: {op_description}")
7428
7429        # Sort operations
7430        operations_help.sort()
7431
7432        # insert header
7433        operations_help.insert(0, "Available calculation operations:")
7434
7435        # Return
7436        return operations_help
7437
7438    def calculation(
7439        self,
7440        operations: dict = {},
7441        operations_config_dict: dict = {},
7442        operations_config_file: str = None,
7443    ) -> None:
7444        """
7445        It takes a list of operations, and for each operation, it checks if it's a python or sql
7446        operation, and then calls the appropriate function
7447
7448        param json example:
7449            "calculation": {
7450                "NOMEN": {
7451                    "options": {
7452                        "hgvs_field": "hgvs"
7453                    },
7454                "middle" : null
7455            }
7456        """
7457
7458        # Param
7459        param = self.get_param()
7460
7461        # operations config
7462        operations_config = self.get_config_json(
7463            name="calculations",
7464            config_dict=operations_config_dict,
7465            config_file=operations_config_file,
7466        )
7467
7468        # Upper keys
7469        operations_config = {k.upper(): v for k, v in operations_config.items()}
7470
7471        # Calculations
7472
7473        # Operations from param
7474        operations = param.get("calculation", {}).get("calculations", operations)
7475
7476        # Quick calculation - add
7477        if param.get("calculations", None):
7478            calculations_list = [
7479                value for value in param.get("calculations", "").split(",")
7480            ]
7481            log.info(f"Quick Calculations:")
7482            for calculation_key in calculations_list:
7483                log.info(f"   {calculation_key}")
7484            for calculation_operation in calculations_list:
7485                if calculation_operation.upper() not in operations:
7486                    operations[calculation_operation.upper()] = {}
7487                    add_value_into_dict(
7488                        dict_tree=param,
7489                        sections=[
7490                            "calculation",
7491                            "calculations",
7492                            calculation_operation.upper(),
7493                        ],
7494                        value={},
7495                    )
7496
7497        # Operations for calculation
7498        if not operations:
7499            operations = param.get("calculation", {}).get("calculations", {})
7500
7501        if operations:
7502            log.info(f"Calculations...")
7503
7504        # For each operations
7505        for operation_name in operations:
7506            operation_name = operation_name.upper()
7507            if operation_name not in [""]:
7508                if operation_name in operations_config:
7509                    log.info(f"Calculation '{operation_name}'")
7510                    operation = operations_config[operation_name]
7511                    operation_type = operation.get("type", "sql")
7512                    if operation_type == "python":
7513                        self.calculation_process_function(
7514                            operation=operation, operation_name=operation_name
7515                        )
7516                    elif operation_type == "sql":
7517                        self.calculation_process_sql(
7518                            operation=operation, operation_name=operation_name
7519                        )
7520                    else:
7521                        log.error(
7522                            f"Operations config: Type '{operation_type}' NOT available"
7523                        )
7524                        raise ValueError(
7525                            f"Operations config: Type '{operation_type}' NOT available"
7526                        )
7527                else:
7528                    log.error(
7529                        f"Operations config: Calculation '{operation_name}' NOT available"
7530                    )
7531                    raise ValueError(
7532                        f"Operations config: Calculation '{operation_name}' NOT available"
7533                    )
7534
7535        # Explode INFOS fields into table fields
7536        if self.get_explode_infos():
7537            self.explode_infos(
7538                prefix=self.get_explode_infos_prefix(),
7539                fields=self.get_explode_infos_fields(),
7540                force=True,
7541            )
7542
7543    def calculation_process_sql(
7544        self, operation: dict, operation_name: str = "unknown"
7545    ) -> None:
7546        """
7547        The `calculation_process_sql` function takes in a mathematical operation as a string and
7548        performs the operation, updating the specified table with the result.
7549
7550        :param operation: The `operation` parameter is a dictionary that contains information about the
7551        mathematical operation to be performed. It includes the following keys:
7552        :type operation: dict
7553        :param operation_name: The `operation_name` parameter is a string that represents the name of
7554        the mathematical operation being performed. It is used for logging and error handling purposes,
7555        defaults to unknown
7556        :type operation_name: str (optional)
7557        """
7558
7559        # table variants
7560        table_variants = self.get_table_variants(clause="alter")
7561
7562        # Operation infos
7563        operation_name = operation.get("name", "unknown")
7564        log.debug(f"process sql {operation_name}")
7565        output_column_name = operation.get("output_column_name", operation_name)
7566        output_column_type = operation.get("output_column_type", "String")
7567        prefix = operation.get("explode_infos_prefix", "")
7568        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7569        output_column_description = operation.get(
7570            "output_column_description", f"{operation_name} operation"
7571        )
7572        operation_query = operation.get("operation_query", None)
7573        if isinstance(operation_query, list):
7574            operation_query = " ".join(operation_query)
7575        operation_info_fields = operation.get("info_fields", [])
7576        operation_info_fields_check = operation.get("info_fields_check", False)
7577        operation_info = operation.get("operation_info", True)
7578
7579        if operation_query:
7580
7581            # Info fields check
7582            operation_info_fields_check_result = True
7583            if operation_info_fields_check:
7584                header_infos = self.get_header().infos
7585                for info_field in operation_info_fields:
7586                    operation_info_fields_check_result = (
7587                        operation_info_fields_check_result
7588                        and info_field in header_infos
7589                    )
7590
7591            # If info fields available
7592            if operation_info_fields_check_result:
7593
7594                # Added_columns
7595                added_columns = []
7596
7597                # Create VCF header field
7598                vcf_reader = self.get_header()
7599                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7600                    output_column_name,
7601                    ".",
7602                    output_column_type,
7603                    output_column_description,
7604                    "howard calculation",
7605                    "0",
7606                    self.code_type_map.get(output_column_type),
7607                )
7608
7609                # Explode infos if needed
7610                log.debug(f"calculation_process_sql prefix {prefix}")
7611                added_columns += self.explode_infos(
7612                    prefix=prefix,
7613                    fields=[output_column_name] + operation_info_fields,
7614                    force=True,
7615                )
7616
7617                # Create column
7618                added_column = self.add_column(
7619                    table_name=table_variants,
7620                    column_name=prefix + output_column_name,
7621                    column_type=output_column_type_sql,
7622                    default_value="null",
7623                )
7624                added_columns.append(added_column)
7625
7626                # Operation calculation
7627                try:
7628
7629                    # Query to update calculation column
7630                    sql_update = f"""
7631                        UPDATE {table_variants}
7632                        SET "{prefix}{output_column_name}" = ({operation_query})
7633                    """
7634                    self.conn.execute(sql_update)
7635
7636                    # Add to INFO
7637                    if operation_info:
7638                        sql_update_info = f"""
7639                            UPDATE {table_variants}
7640                            SET "INFO" =
7641                                concat(
7642                                    CASE
7643                                        WHEN "INFO" IS NOT NULL
7644                                        THEN concat("INFO", ';')
7645                                        ELSE ''
7646                                    END,
7647                                    '{output_column_name}=',
7648                                    "{prefix}{output_column_name}"
7649                                )
7650                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7651                        """
7652                        self.conn.execute(sql_update_info)
7653
7654                except:
7655                    log.error(
7656                        f"Operations config: Calculation '{operation_name}' query failed"
7657                    )
7658                    raise ValueError(
7659                        f"Operations config: Calculation '{operation_name}' query failed"
7660                    )
7661
7662                # Remove added columns
7663                for added_column in added_columns:
7664                    log.debug(f"added_column: {added_column}")
7665                    self.drop_column(column=added_column)
7666
7667            else:
7668                log.error(
7669                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7670                )
7671                raise ValueError(
7672                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7673                )
7674
7675        else:
7676            log.error(
7677                f"Operations config: Calculation '{operation_name}' query NOT defined"
7678            )
7679            raise ValueError(
7680                f"Operations config: Calculation '{operation_name}' query NOT defined"
7681            )
7682
7683    def calculation_process_function(
7684        self, operation: dict, operation_name: str = "unknown"
7685    ) -> None:
7686        """
7687        The `calculation_process_function` takes in an operation dictionary and performs the specified
7688        function with the given parameters.
7689
7690        :param operation: The `operation` parameter is a dictionary that contains information about the
7691        operation to be performed. It has the following keys:
7692        :type operation: dict
7693        :param operation_name: The `operation_name` parameter is a string that represents the name of
7694        the operation being performed. It is used for logging purposes, defaults to unknown
7695        :type operation_name: str (optional)
7696        """
7697
7698        operation_name = operation["name"]
7699        log.debug(f"process sql {operation_name}")
7700        function_name = operation["function_name"]
7701        function_params = operation["function_params"]
7702        getattr(self, function_name)(*function_params)
7703
7704    def calculation_variant_id(self) -> None:
7705        """
7706        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7707        updates the INFO field of a variants table with the variant ID.
7708        """
7709
7710        # variant_id annotation field
7711        variant_id_tag = self.get_variant_id_column()
7712        added_columns = [variant_id_tag]
7713
7714        # variant_id hgvs tags"
7715        vcf_infos_tags = {
7716            variant_id_tag: "howard variant ID annotation",
7717        }
7718
7719        # Variants table
7720        table_variants = self.get_table_variants()
7721
7722        # Header
7723        vcf_reader = self.get_header()
7724
7725        # Add variant_id to header
7726        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7727            variant_id_tag,
7728            ".",
7729            "String",
7730            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7731            "howard calculation",
7732            "0",
7733            self.code_type_map.get("String"),
7734        )
7735
7736        # Update
7737        sql_update = f"""
7738            UPDATE {table_variants}
7739            SET "INFO" = 
7740                concat(
7741                    CASE
7742                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7743                        THEN ''
7744                        ELSE concat("INFO", ';')
7745                    END,
7746                    '{variant_id_tag}=',
7747                    "{variant_id_tag}"
7748                )
7749        """
7750        self.conn.execute(sql_update)
7751
7752        # Remove added columns
7753        for added_column in added_columns:
7754            self.drop_column(column=added_column)
7755
7756    def calculation_extract_snpeff_hgvs(self) -> None:
7757        """
7758        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7759        annotation field in a VCF file and adds them as a new column in the variants table.
7760        """
7761
7762        # SnpEff annotation field
7763        snpeff_ann = "ANN"
7764
7765        # SnpEff annotation field
7766        snpeff_hgvs = "snpeff_hgvs"
7767
7768        # Snpeff hgvs tags
7769        vcf_infos_tags = {
7770            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7771        }
7772
7773        # Prefix
7774        prefix = self.get_explode_infos_prefix()
7775        if prefix:
7776            prefix = "INFO/"
7777
7778        # snpEff fields
7779        speff_ann_infos = prefix + snpeff_ann
7780        speff_hgvs_infos = prefix + snpeff_hgvs
7781
7782        # Variants table
7783        table_variants = self.get_table_variants()
7784
7785        # Header
7786        vcf_reader = self.get_header()
7787
7788        # Add columns
7789        added_columns = []
7790
7791        # Explode HGVS field in column
7792        added_columns += self.explode_infos(fields=[snpeff_ann])
7793
7794        if "ANN" in vcf_reader.infos:
7795
7796            log.debug(vcf_reader.infos["ANN"])
7797
7798            # Create variant id
7799            variant_id_column = self.get_variant_id_column()
7800            added_columns += [variant_id_column]
7801
7802            # Create dataframe
7803            dataframe_snpeff_hgvs = self.get_query_to_df(
7804                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7805            )
7806
7807            # Create main NOMEN column
7808            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7809                speff_ann_infos
7810            ].apply(lambda x: extract_snpeff_hgvs(str(x)))
7811
7812            # Add snpeff_hgvs to header
7813            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7814                snpeff_hgvs,
7815                ".",
7816                "String",
7817                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7818                "howard calculation",
7819                "0",
7820                self.code_type_map.get("String"),
7821            )
7822
7823            # Update
7824            sql_update = f"""
7825                UPDATE variants
7826                SET "INFO" = 
7827                    concat(
7828                        CASE
7829                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7830                            THEN ''
7831                            ELSE concat("INFO", ';')
7832                        END,
7833                        CASE 
7834                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7835                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7836                            THEN concat(
7837                                    '{snpeff_hgvs}=',
7838                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7839                                )
7840                            ELSE ''
7841                        END
7842                    )
7843                FROM dataframe_snpeff_hgvs
7844                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7845
7846            """
7847            self.conn.execute(sql_update)
7848
7849            # Delete dataframe
7850            del dataframe_snpeff_hgvs
7851            gc.collect()
7852
7853        else:
7854
7855            log.warning(
7856                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7857            )
7858
7859        # Remove added columns
7860        for added_column in added_columns:
7861            self.drop_column(column=added_column)
7862
7863    def calculation_extract_nomen(self) -> None:
7864        """
7865        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
7866        """
7867
7868        # NOMEN field
7869        field_nomen_dict = "NOMEN_DICT"
7870
7871        # NOMEN structure
7872        nomen_dict = {
7873            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
7874            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
7875            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
7876            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
7877            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
7878            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
7879            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
7880            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
7881            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
7882            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
7883        }
7884
7885        # Param
7886        param = self.get_param()
7887
7888        # Prefix
7889        prefix = self.get_explode_infos_prefix()
7890
7891        # Header
7892        vcf_reader = self.get_header()
7893
7894        # Get HGVS field
7895        hgvs_field = (
7896            param.get("calculation", {})
7897            .get("calculations", {})
7898            .get("NOMEN", {})
7899            .get("options", {})
7900            .get("hgvs_field", "hgvs")
7901        )
7902
7903        # Get transcripts
7904        transcripts_file = (
7905            param.get("calculation", {})
7906            .get("calculations", {})
7907            .get("NOMEN", {})
7908            .get("options", {})
7909            .get("transcripts", None)
7910        )
7911        transcripts_file = full_path(transcripts_file)
7912        transcripts = []
7913        if transcripts_file:
7914            if os.path.exists(transcripts_file):
7915                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
7916                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
7917            else:
7918                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
7919                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
7920
7921        # Added columns
7922        added_columns = []
7923
7924        # Explode HGVS field in column
7925        added_columns += self.explode_infos(fields=[hgvs_field])
7926
7927        # extra infos
7928        extra_infos = self.get_extra_infos()
7929        extra_field = prefix + hgvs_field
7930
7931        if extra_field in extra_infos:
7932
7933            # Create dataframe
7934            dataframe_hgvs = self.get_query_to_df(
7935                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
7936            )
7937
7938            # Create main NOMEN column
7939            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
7940                lambda x: find_nomen(str(x), transcripts=transcripts)
7941            )
7942
7943            # Explode NOMEN Structure and create SQL set for update
7944            sql_nomen_fields = []
7945            for nomen_field in nomen_dict:
7946
7947                # Explode each field into a column
7948                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
7949                    lambda x: dict(x).get(nomen_field, "")
7950                )
7951
7952                # Create VCF header field
7953                vcf_reader.infos[nomen_field] = vcf.parser._Info(
7954                    nomen_field,
7955                    ".",
7956                    "String",
7957                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
7958                    "howard calculation",
7959                    "0",
7960                    self.code_type_map.get("String"),
7961                )
7962                sql_nomen_fields.append(
7963                    f"""
7964                        CASE 
7965                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
7966                            THEN concat(
7967                                    ';{nomen_field}=',
7968                                    dataframe_hgvs."{nomen_field}"
7969                                )
7970                            ELSE ''
7971                        END
7972                    """
7973                )
7974
7975            # SQL set for update
7976            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
7977
7978            # Update
7979            sql_update = f"""
7980                UPDATE variants
7981                SET "INFO" = 
7982                    concat(
7983                        CASE
7984                            WHEN "INFO" IS NULL
7985                            THEN ''
7986                            ELSE "INFO"
7987                        END,
7988                        {sql_nomen_fields_set}
7989                    )
7990                FROM dataframe_hgvs
7991                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
7992                    AND variants."POS" = dataframe_hgvs."POS" 
7993                    AND variants."REF" = dataframe_hgvs."REF"
7994                    AND variants."ALT" = dataframe_hgvs."ALT"
7995            """
7996            self.conn.execute(sql_update)
7997
7998            # Delete dataframe
7999            del dataframe_hgvs
8000            gc.collect()
8001
8002        # Remove added columns
8003        for added_column in added_columns:
8004            self.drop_column(column=added_column)
8005
8006    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8007        """
8008        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8009        pipeline/sample for a variant and updates the variant information in a VCF file.
8010
8011        :param tag: The `tag` parameter is a string that represents the annotation field for the
8012        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8013        VCF header and to update the corresponding field in the variants table, defaults to
8014        findbypipeline
8015        :type tag: str (optional)
8016        """
8017
8018        # if FORMAT and samples
8019        if (
8020            "FORMAT" in self.get_header_columns_as_list()
8021            and self.get_header_sample_list()
8022        ):
8023
8024            # findbypipeline annotation field
8025            findbypipeline_tag = tag
8026
8027            # VCF infos tags
8028            vcf_infos_tags = {
8029                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8030            }
8031
8032            # Prefix
8033            prefix = self.get_explode_infos_prefix()
8034
8035            # Field
8036            findbypipeline_infos = prefix + findbypipeline_tag
8037
8038            # Variants table
8039            table_variants = self.get_table_variants()
8040
8041            # Header
8042            vcf_reader = self.get_header()
8043
8044            # Create variant id
8045            variant_id_column = self.get_variant_id_column()
8046            added_columns = [variant_id_column]
8047
8048            # variant_id, FORMAT and samples
8049            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8050                self.get_header_sample_list()
8051            )
8052
8053            # Create dataframe
8054            dataframe_findbypipeline = self.get_query_to_df(
8055                f""" SELECT {samples_fields} FROM {table_variants} """
8056            )
8057
8058            # Create findbypipeline column
8059            dataframe_findbypipeline[findbypipeline_infos] = (
8060                dataframe_findbypipeline.apply(
8061                    lambda row: findbypipeline(
8062                        row, samples=self.get_header_sample_list()
8063                    ),
8064                    axis=1,
8065                )
8066            )
8067
8068            # Add snpeff_hgvs to header
8069            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8070                findbypipeline_tag,
8071                ".",
8072                "String",
8073                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8074                "howard calculation",
8075                "0",
8076                self.code_type_map.get("String"),
8077            )
8078
8079            # Update
8080            sql_update = f"""
8081                UPDATE variants
8082                SET "INFO" = 
8083                    concat(
8084                        CASE
8085                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8086                            THEN ''
8087                            ELSE concat("INFO", ';')
8088                        END,
8089                        CASE 
8090                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8091                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8092                            THEN concat(
8093                                    '{findbypipeline_tag}=',
8094                                    dataframe_findbypipeline."{findbypipeline_infos}"
8095                                )
8096                            ELSE ''
8097                        END
8098                    )
8099                FROM dataframe_findbypipeline
8100                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8101            """
8102            self.conn.execute(sql_update)
8103
8104            # Remove added columns
8105            for added_column in added_columns:
8106                self.drop_column(column=added_column)
8107
8108            # Delete dataframe
8109            del dataframe_findbypipeline
8110            gc.collect()
8111
8112    def calculation_genotype_concordance(self) -> None:
8113        """
8114        The function `calculation_genotype_concordance` calculates the genotype concordance for
8115        multi-caller VCF files and updates the variant information in the database.
8116        """
8117
8118        # if FORMAT and samples
8119        if (
8120            "FORMAT" in self.get_header_columns_as_list()
8121            and self.get_header_sample_list()
8122        ):
8123
8124            # genotypeconcordance annotation field
8125            genotypeconcordance_tag = "genotypeconcordance"
8126
8127            # VCF infos tags
8128            vcf_infos_tags = {
8129                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8130            }
8131
8132            # Prefix
8133            prefix = self.get_explode_infos_prefix()
8134
8135            # Field
8136            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8137
8138            # Variants table
8139            table_variants = self.get_table_variants()
8140
8141            # Header
8142            vcf_reader = self.get_header()
8143
8144            # Create variant id
8145            variant_id_column = self.get_variant_id_column()
8146            added_columns = [variant_id_column]
8147
8148            # variant_id, FORMAT and samples
8149            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8150                self.get_header_sample_list()
8151            )
8152
8153            # Create dataframe
8154            dataframe_genotypeconcordance = self.get_query_to_df(
8155                f""" SELECT {samples_fields} FROM {table_variants} """
8156            )
8157
8158            # Create genotypeconcordance column
8159            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8160                dataframe_genotypeconcordance.apply(
8161                    lambda row: genotypeconcordance(
8162                        row, samples=self.get_header_sample_list()
8163                    ),
8164                    axis=1,
8165                )
8166            )
8167
8168            # Add genotypeconcordance to header
8169            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8170                genotypeconcordance_tag,
8171                ".",
8172                "String",
8173                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8174                "howard calculation",
8175                "0",
8176                self.code_type_map.get("String"),
8177            )
8178
8179            # Update
8180            sql_update = f"""
8181                UPDATE variants
8182                SET "INFO" = 
8183                    concat(
8184                        CASE
8185                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8186                            THEN ''
8187                            ELSE concat("INFO", ';')
8188                        END,
8189                        CASE
8190                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8191                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8192                            THEN concat(
8193                                    '{genotypeconcordance_tag}=',
8194                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8195                                )
8196                            ELSE ''
8197                        END
8198                    )
8199                FROM dataframe_genotypeconcordance
8200                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8201            """
8202            self.conn.execute(sql_update)
8203
8204            # Remove added columns
8205            for added_column in added_columns:
8206                self.drop_column(column=added_column)
8207
8208            # Delete dataframe
8209            del dataframe_genotypeconcordance
8210            gc.collect()
8211
8212    def calculation_barcode(self, tag: str = "barcode") -> None:
8213        """
8214        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8215        updates the INFO field in the file with the calculated barcode values.
8216        """
8217
8218        # if FORMAT and samples
8219        if (
8220            "FORMAT" in self.get_header_columns_as_list()
8221            and self.get_header_sample_list()
8222        ):
8223
8224            # barcode annotation field
8225            if not tag:
8226                tag = "barcode"
8227
8228            # VCF infos tags
8229            vcf_infos_tags = {
8230                tag: "barcode calculation (VaRank)",
8231            }
8232
8233            # Prefix
8234            prefix = self.get_explode_infos_prefix()
8235
8236            # Field
8237            barcode_infos = prefix + tag
8238
8239            # Variants table
8240            table_variants = self.get_table_variants()
8241
8242            # Header
8243            vcf_reader = self.get_header()
8244
8245            # Create variant id
8246            variant_id_column = self.get_variant_id_column()
8247            added_columns = [variant_id_column]
8248
8249            # variant_id, FORMAT and samples
8250            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8251                self.get_header_sample_list()
8252            )
8253
8254            # Create dataframe
8255            dataframe_barcode = self.get_query_to_df(
8256                f""" SELECT {samples_fields} FROM {table_variants} """
8257            )
8258
8259            # Create barcode column
8260            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8261                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8262            )
8263
8264            # Add barcode to header
8265            vcf_reader.infos[tag] = vcf.parser._Info(
8266                tag,
8267                ".",
8268                "String",
8269                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8270                "howard calculation",
8271                "0",
8272                self.code_type_map.get("String"),
8273            )
8274
8275            # Update
8276            sql_update = f"""
8277                UPDATE {table_variants}
8278                SET "INFO" = 
8279                    concat(
8280                        CASE
8281                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8282                            THEN ''
8283                            ELSE concat("INFO", ';')
8284                        END,
8285                        CASE
8286                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8287                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8288                            THEN concat(
8289                                    '{tag}=',
8290                                    dataframe_barcode."{barcode_infos}"
8291                                )
8292                            ELSE ''
8293                        END
8294                    )
8295                FROM dataframe_barcode
8296                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8297            """
8298            self.conn.execute(sql_update)
8299
8300            # Remove added columns
8301            for added_column in added_columns:
8302                self.drop_column(column=added_column)
8303
8304            # Delete dataframe
8305            del dataframe_barcode
8306            gc.collect()
8307
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family members are taken from the 'BARCODEFAMILY.family_pedigree'
        calculation option (a JSON file path, a JSON string, a comma-separated
        sample list, or a dict); when absent, all header samples are used. The
        computed barcode and the family sample list are appended to each sample's
        genotype column as two extra FORMAT sub-fields ('<tag>' and '<tag>S').

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is neither str nor dict, or resolves empty
        """

        # Only applicable when the VCF has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Guard against an explicitly empty tag
            if not tag:
                tag = "BCF"

            # Header descriptions for the two FORMAT fields added below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree from calculation options
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a path to a JSON file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # each sample mapping to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Family sample names (dict values)
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column receiving the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the family samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Fetch genotypes for the family samples
            # NOTE: this variable name is referenced by the SQL below (duckdb
            # resolves 'dataframe_barcode' from the Python frame)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode per variant
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Register both FORMAT fields ('<tag>' value and '<tag>S' sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: family samples get the barcode and
            # the sample list, FORMAT gets the field names, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For a './.' genotype, first pad it with one '.' per FORMAT
                # sub-field: stripping [a-zA-Z0-9\s] from FORMAT leaves the ':'
                # separators, and ':' -> ':.' inserts the placeholders; then the
                # two new sub-field values are appended with ':' separators
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant ID
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
8497
8498    def calculation_trio(self) -> None:
8499        """
8500        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8501        information to the INFO field of each variant.
8502        """
8503
8504        # if FORMAT and samples
8505        if (
8506            "FORMAT" in self.get_header_columns_as_list()
8507            and self.get_header_sample_list()
8508        ):
8509
8510            # trio annotation field
8511            trio_tag = "trio"
8512
8513            # VCF infos tags
8514            vcf_infos_tags = {
8515                "trio": "trio calculation",
8516            }
8517
8518            # Param
8519            param = self.get_param()
8520
8521            # Prefix
8522            prefix = self.get_explode_infos_prefix()
8523
8524            # Trio param
8525            trio_ped = (
8526                param.get("calculation", {})
8527                .get("calculations", {})
8528                .get("TRIO", {})
8529                .get("trio_pedigree", None)
8530            )
8531
8532            # Load trio
8533            if trio_ped:
8534
8535                # Trio pedigree is a file
8536                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8537                    log.debug("TRIO pedigree is file")
8538                    with open(full_path(trio_ped)) as trio_ped:
8539                        trio_ped = json.load(trio_ped)
8540
8541                # Trio pedigree is a string
8542                elif isinstance(trio_ped, str):
8543                    log.debug("TRIO pedigree is str")
8544                    try:
8545                        trio_ped = json.loads(trio_ped)
8546                        log.debug("TRIO pedigree is json str")
8547                    except ValueError as e:
8548                        trio_samples = trio_ped.split(",")
8549                        if len(trio_samples) == 3:
8550                            trio_ped = {
8551                                "father": trio_samples[0],
8552                                "mother": trio_samples[1],
8553                                "child": trio_samples[2],
8554                            }
8555                            log.debug("TRIO pedigree is list str")
8556                        else:
8557                            msg_error = "TRIO pedigree not well formatted"
8558                            log.error(msg_error)
8559                            raise ValueError(msg_error)
8560
8561                # Trio pedigree is a dict
8562                elif isinstance(trio_ped, dict):
8563                    log.debug("TRIO pedigree is dict")
8564
8565                # Trio pedigree is not well formatted
8566                else:
8567                    msg_error = "TRIO pedigree not well formatted"
8568                    log.error(msg_error)
8569                    raise ValueError(msg_error)
8570
8571                # Construct trio list
8572                trio_samples = [
8573                    trio_ped.get("father", ""),
8574                    trio_ped.get("mother", ""),
8575                    trio_ped.get("child", ""),
8576                ]
8577
8578            else:
8579                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8580                samples_list = self.get_header_sample_list()
8581                if len(samples_list) >= 3:
8582                    trio_samples = self.get_header_sample_list()[0:3]
8583                    trio_ped = {
8584                        "father": trio_samples[0],
8585                        "mother": trio_samples[1],
8586                        "child": trio_samples[2],
8587                    }
8588                else:
8589                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8590                    log.error(msg_error)
8591                    raise ValueError(msg_error)
8592
8593            # Check trio pedigree
8594            if not trio_ped or len(trio_ped) != 3:
8595                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8596                log.error(msg_error)
8597                raise ValueError(msg_error)
8598
8599            # Log
8600            log.info(
8601                f"Calculation 'TRIO' - Samples: "
8602                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8603            )
8604
8605            # Field
8606            trio_infos = prefix + trio_tag
8607
8608            # Variants table
8609            table_variants = self.get_table_variants()
8610
8611            # Header
8612            vcf_reader = self.get_header()
8613
8614            # Create variant id
8615            variant_id_column = self.get_variant_id_column()
8616            added_columns = [variant_id_column]
8617
8618            # variant_id, FORMAT and samples
8619            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8620                self.get_header_sample_list()
8621            )
8622
8623            # Create dataframe
8624            dataframe_trio = self.get_query_to_df(
8625                f""" SELECT {samples_fields} FROM {table_variants} """
8626            )
8627
8628            # Create trio column
8629            dataframe_trio[trio_infos] = dataframe_trio.apply(
8630                lambda row: trio(row, samples=trio_samples), axis=1
8631            )
8632
8633            # Add trio to header
8634            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8635                trio_tag,
8636                ".",
8637                "String",
8638                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8639                "howard calculation",
8640                "0",
8641                self.code_type_map.get("String"),
8642            )
8643
8644            # Update
8645            sql_update = f"""
8646                UPDATE {table_variants}
8647                SET "INFO" = 
8648                    concat(
8649                        CASE
8650                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8651                            THEN ''
8652                            ELSE concat("INFO", ';')
8653                        END,
8654                        CASE
8655                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8656                             AND dataframe_trio."{trio_infos}" NOT NULL
8657                            THEN concat(
8658                                    '{trio_tag}=',
8659                                    dataframe_trio."{trio_infos}"
8660                                )
8661                            ELSE ''
8662                        END
8663                    )
8664                FROM dataframe_trio
8665                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8666            """
8667            self.conn.execute(sql_update)
8668
8669            # Remove added columns
8670            for added_column in added_columns:
8671                self.drop_column(column=added_column)
8672
8673            # Delete dataframe
8674            del dataframe_trio
8675            gc.collect()
8676
8677    def calculation_vaf_normalization(self) -> None:
8678        """
8679        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8680        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8681        :return: The function does not return anything.
8682        """
8683
8684        # if FORMAT and samples
8685        if (
8686            "FORMAT" in self.get_header_columns_as_list()
8687            and self.get_header_sample_list()
8688        ):
8689
8690            # vaf_normalization annotation field
8691            vaf_normalization_tag = "VAF"
8692
8693            # VCF infos tags
8694            vcf_infos_tags = {
8695                "VAF": "VAF Variant Frequency",
8696            }
8697
8698            # Prefix
8699            prefix = self.get_explode_infos_prefix()
8700
8701            # Variants table
8702            table_variants = self.get_table_variants()
8703
8704            # Header
8705            vcf_reader = self.get_header()
8706
8707            # Do not calculate if VAF already exists
8708            if "VAF" in vcf_reader.formats:
8709                log.debug("VAF already on genotypes")
8710                return
8711
8712            # Create variant id
8713            variant_id_column = self.get_variant_id_column()
8714            added_columns = [variant_id_column]
8715
8716            # variant_id, FORMAT and samples
8717            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8718                self.get_header_sample_list()
8719            )
8720
8721            # Create dataframe
8722            dataframe_vaf_normalization = self.get_query_to_df(
8723                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
8724            )
8725
8726            vaf_normalization_set = []
8727
8728            # for each sample vaf_normalization
8729            for sample in self.get_header_sample_list():
8730                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
8731                    lambda row: vaf_normalization(row, sample=sample), axis=1
8732                )
8733                vaf_normalization_set.append(
8734                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
8735                )
8736
8737            # Add VAF to FORMAT
8738            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
8739                "FORMAT"
8740            ].apply(lambda x: str(x) + ":VAF")
8741            vaf_normalization_set.append(
8742                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
8743            )
8744
8745            # Add vaf_normalization to header
8746            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
8747                id=vaf_normalization_tag,
8748                num="1",
8749                type="Float",
8750                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
8751                type_code=self.code_type_map.get("Float"),
8752            )
8753
8754            # Create fields to add in INFO
8755            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
8756
8757            # Update
8758            sql_update = f"""
8759                UPDATE {table_variants}
8760                SET {sql_vaf_normalization_set}
8761                FROM dataframe_vaf_normalization
8762                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
8763
8764            """
8765            self.conn.execute(sql_update)
8766
8767            # Remove added columns
8768            for added_column in added_columns:
8769                self.drop_column(column=added_column)
8770
8771            # Delete dataframe
8772            del dataframe_vaf_normalization
8773            gc.collect()
8774
8775    def calculation_genotype_stats(self, info: str = "VAF") -> None:
8776        """
8777        The `calculation_genotype_stats` function calculates genotype statistics for a given information
8778        field in a VCF file and updates the INFO column of the variants table with the calculated
8779        statistics.
8780
8781        :param info: The `info` parameter is a string that represents the type of information for which
8782        genotype statistics are calculated. It is used to generate various VCF info tags for the
8783        statistics, such as the number of occurrences, the list of values, the minimum value, the
8784        maximum value, the mean, the median, defaults to VAF
8785        :type info: str (optional)
8786        """
8787
8788        # if FORMAT and samples
8789        if (
8790            "FORMAT" in self.get_header_columns_as_list()
8791            and self.get_header_sample_list()
8792        ):
8793
8794            # vaf_stats annotation field
8795            vaf_stats_tag = info + "_stats"
8796
8797            # VCF infos tags
8798            vcf_infos_tags = {
8799                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
8800                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
8801                info + "_stats_min": f"genotype {info} Statistics - min {info}",
8802                info + "_stats_max": f"genotype {info} Statistics - max {info}",
8803                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
8804                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
8805                info
8806                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
8807            }
8808
8809            # Prefix
8810            prefix = self.get_explode_infos_prefix()
8811
8812            # Field
8813            vaf_stats_infos = prefix + vaf_stats_tag
8814
8815            # Variants table
8816            table_variants = self.get_table_variants()
8817
8818            # Header
8819            vcf_reader = self.get_header()
8820
8821            # Create variant id
8822            variant_id_column = self.get_variant_id_column()
8823            added_columns = [variant_id_column]
8824
8825            # variant_id, FORMAT and samples
8826            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8827                self.get_header_sample_list()
8828            )
8829
8830            # Create dataframe
8831            dataframe_vaf_stats = self.get_query_to_df(
8832                f""" SELECT {samples_fields} FROM {table_variants} """
8833            )
8834
8835            # Create vaf_stats column
8836            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
8837                lambda row: genotype_stats(
8838                    row, samples=self.get_header_sample_list(), info=info
8839                ),
8840                axis=1,
8841            )
8842
8843            # List of vcf tags
8844            sql_vaf_stats_fields = []
8845
8846            # Check all VAF stats infos
8847            for stat in vcf_infos_tags:
8848
8849                # Extract stats
8850                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
8851                    lambda x: dict(x).get(stat, "")
8852                )
8853
8854                # Add snpeff_hgvs to header
8855                vcf_reader.infos[stat] = vcf.parser._Info(
8856                    stat,
8857                    ".",
8858                    "String",
8859                    vcf_infos_tags.get(stat, "genotype statistics"),
8860                    "howard calculation",
8861                    "0",
8862                    self.code_type_map.get("String"),
8863                )
8864
8865                if len(sql_vaf_stats_fields):
8866                    sep = ";"
8867                else:
8868                    sep = ""
8869
8870                # Create fields to add in INFO
8871                sql_vaf_stats_fields.append(
8872                    f"""
8873                        CASE
8874                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
8875                            THEN concat(
8876                                    '{sep}{stat}=',
8877                                    dataframe_vaf_stats."{stat}"
8878                                )
8879                            ELSE ''
8880                        END
8881                    """
8882                )
8883
8884            # SQL set for update
8885            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
8886
8887            # Update
8888            sql_update = f"""
8889                UPDATE variants
8890                SET "INFO" = 
8891                    concat(
8892                        CASE
8893                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8894                            THEN ''
8895                            ELSE concat("INFO", ';')
8896                        END,
8897                        {sql_vaf_stats_fields_set}
8898                    )
8899                FROM dataframe_vaf_stats
8900                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
8901
8902            """
8903            self.conn.execute(sql_update)
8904
8905            # Remove added columns
8906            for added_column in added_columns:
8907                self.drop_column(column=added_column)
8908
8909            # Delete dataframe
8910            del dataframe_vaf_stats
8911            gc.collect()
class Variants:
  34class Variants:
  35
  36    def __init__(
  37        self,
  38        conn=None,
  39        input: str = None,
  40        output: str = None,
  41        config: dict = {},
  42        param: dict = {},
  43        load: bool = False,
  44    ) -> None:
  45        """
  46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
  47        header
  48
  49        :param conn: the connection to the database
  50        :param input: the input file
  51        :param output: the output file
  52        :param config: a dictionary containing the configuration of the model
  53        :param param: a dictionary containing the parameters of the model
  54        """
  55
  56        # Init variables
  57        self.init_variables()
  58
  59        # Input
  60        self.set_input(input)
  61
  62        # Config
  63        self.set_config(config)
  64
  65        # Param
  66        self.set_param(param)
  67
  68        # Output
  69        self.set_output(output)
  70
  71        # connexion
  72        self.set_connexion(conn)
  73
  74        # Header
  75        self.set_header()
  76
  77        # Load data
  78        if load:
  79            self.load_data()
  80
  81    def set_input(self, input: str = None) -> None:
  82        """
  83        The function takes a file name as input, splits the file name into a name and an extension, and
  84        then sets the input_name, input_extension, and input_format attributes of the class
  85
  86        :param input: The input file
  87        """
  88
  89        if input and not isinstance(input, str):
  90            try:
  91                self.input = input.name
  92            except:
  93                log.error(f"Input file '{input} in bad format")
  94                raise ValueError(f"Input file '{input} in bad format")
  95        else:
  96            self.input = input
  97
  98        # Input format
  99        if input:
 100            input_name, input_extension = os.path.splitext(self.input)
 101            self.input_name = input_name
 102            self.input_extension = input_extension
 103            self.input_format = self.input_extension.replace(".", "")
 104
 105    def set_config(self, config: dict) -> None:
 106        """
 107        This function takes in a config object and sets it as the config object for the class
 108
 109        :param config: The configuration object
 110        """
 111        self.config = config
 112
 113    def set_param(self, param: dict) -> None:
 114        """
 115        This function takes in a param object and sets it as the param object for the class
 116
 117        :param param: The paramters object
 118        """
 119        self.param = param
 120
 121    def init_variables(self) -> None:
 122        """
 123        This function initializes the variables that will be used in the rest of the class
 124        """
 125        self.prefix = "howard"
 126        self.table_variants = "variants"
 127        self.dataframe = None
 128
 129        self.comparison_map = {
 130            "gt": ">",
 131            "gte": ">=",
 132            "lt": "<",
 133            "lte": "<=",
 134            "equals": "=",
 135            "contains": "SIMILAR TO",
 136        }
 137
 138        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
 139
 140        self.code_type_map_to_sql = {
 141            "Integer": "INTEGER",
 142            "String": "VARCHAR",
 143            "Float": "FLOAT",
 144            "Flag": "VARCHAR",
 145        }
 146
 147        self.index_additionnal_fields = []
 148
 149    def get_indexing(self) -> bool:
 150        """
 151        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
 152        returns False.
 153        :return: The value of the indexing parameter.
 154        """
 155        return self.get_param().get("indexing", False)
 156
 157    def get_connexion_config(self) -> dict:
 158        """
 159        The function `get_connexion_config` returns a dictionary containing the configuration for a
 160        connection, including the number of threads and memory limit.
 161        :return: a dictionary containing the configuration for the Connexion library.
 162        """
 163
 164        # config
 165        config = self.get_config()
 166
 167        # Connexion config
 168        connexion_config = {}
 169        threads = self.get_threads()
 170
 171        # Threads
 172        if threads:
 173            connexion_config["threads"] = threads
 174
 175        # Memory
 176        # if config.get("memory", None):
 177        #     connexion_config["memory_limit"] = config.get("memory")
 178        if self.get_memory():
 179            connexion_config["memory_limit"] = self.get_memory()
 180
 181        # Temporary directory
 182        if config.get("tmp", None):
 183            connexion_config["temp_directory"] = config.get("tmp")
 184
 185        # Access
 186        if config.get("access", None):
 187            access = config.get("access")
 188            if access in ["RO"]:
 189                access = "READ_ONLY"
 190            elif access in ["RW"]:
 191                access = "READ_WRITE"
 192            connexion_db = self.get_connexion_db()
 193            if connexion_db in ":memory:":
 194                access = "READ_WRITE"
 195            connexion_config["access_mode"] = access
 196
 197        return connexion_config
 198
 199    def get_duckdb_settings(self) -> dict:
 200        """
 201        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
 202        string.
 203        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
 204        """
 205
 206        # config
 207        config = self.get_config()
 208
 209        # duckdb settings
 210        duckdb_settings_dict = {}
 211        if config.get("duckdb_settings", None):
 212            duckdb_settings = config.get("duckdb_settings")
 213            duckdb_settings = full_path(duckdb_settings)
 214            # duckdb setting is a file
 215            if os.path.exists(duckdb_settings):
 216                with open(duckdb_settings) as json_file:
 217                    duckdb_settings_dict = yaml.safe_load(json_file)
 218            # duckdb settings is a string
 219            else:
 220                duckdb_settings_dict = json.loads(duckdb_settings)
 221
 222        return duckdb_settings_dict
 223
 224    def set_connexion_db(self) -> str:
 225        """
 226        The function `set_connexion_db` returns the appropriate database connection string based on the
 227        input format and connection type.
 228        :return: the value of the variable `connexion_db`.
 229        """
 230
 231        # Default connexion db
 232        default_connexion_db = ":memory:"
 233
 234        # Find connexion db
 235        if self.get_input_format() in ["db", "duckdb"]:
 236            connexion_db = self.get_input()
 237        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
 238            connexion_db = default_connexion_db
 239        elif self.get_connexion_type() in ["tmpfile"]:
 240            tmp_name = tempfile.mkdtemp(
 241                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
 242            )
 243            connexion_db = f"{tmp_name}/tmp.db"
 244        elif self.get_connexion_type() != "":
 245            connexion_db = self.get_connexion_type()
 246        else:
 247            connexion_db = default_connexion_db
 248
 249        # Set connexion db
 250        self.connexion_db = connexion_db
 251
 252        return connexion_db
 253
 254    def set_connexion(self, conn) -> None:
 255        """
 256        It creates a connection to the database
 257
 258        :param conn: The connection to the database. If not provided, a new connection to an in-memory
 259        database is created
 260        """
 261
 262        # Connexion db
 263        connexion_db = self.set_connexion_db()
 264
 265        # Connexion config
 266        connexion_config = self.get_connexion_config()
 267
 268        # Connexion format
 269        connexion_format = self.get_config().get("connexion_format", "duckdb")
 270        # Set connexion format
 271        self.connexion_format = connexion_format
 272
 273        # Connexion
 274        if not conn:
 275            if connexion_format in ["duckdb"]:
 276                conn = duckdb.connect(connexion_db, config=connexion_config)
 277                # duckDB settings
 278                duckdb_settings = self.get_duckdb_settings()
 279                if duckdb_settings:
 280                    for setting in duckdb_settings:
 281                        setting_value = duckdb_settings.get(setting)
 282                        if isinstance(setting_value, str):
 283                            setting_value = f"'{setting_value}'"
 284                        conn.execute(f"PRAGMA {setting}={setting_value};")
 285            elif connexion_format in ["sqlite"]:
 286                conn = sqlite3.connect(connexion_db)
 287
 288        # Set connexion
 289        self.conn = conn
 290
 291        # Log
 292        log.debug(f"connexion_format: {connexion_format}")
 293        log.debug(f"connexion_db: {connexion_db}")
 294        log.debug(f"connexion config: {connexion_config}")
 295        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
 296
 297    def set_output(self, output: str = None) -> None:
 298        """
 299        If the config file has an output key, set the output to the value of that key. Otherwise, set
 300        the output to the input
 301
 302        :param output: The name of the output file
 303        """
 304
 305        if output and not isinstance(output, str):
 306            self.output = output.name
 307        else:
 308            self.output = output
 309
 310        # Output format
 311        if self.output:
 312            output_name, output_extension = os.path.splitext(self.output)
 313            self.output_name = output_name
 314            self.output_extension = output_extension
 315            self.output_format = self.output_extension.replace(".", "")
 316        else:
 317            self.output_name = None
 318            self.output_extension = None
 319            self.output_format = None
 320
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of strings
        (self.header_list) and as a VCF reader object (self.header_vcf).

        Header resolution order, depending on the input format:
        1. an explicit header file given in config ("header_file");
        2. the header embedded in a vcf/hdr input file (possibly bgzip-compressed);
        3. a sidecar file "<input>.hdr" next to the input file;
        4. a header inferred from the file's columns via the Database helper;
        5. a minimal default VCF header as a last resort.

        When no input file is set, header_list and header_vcf are set to None.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as a fallback
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): vcf.Writer is used only for its side effect
                            # of writing header lines to f; the writer object is discarded
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # Best-effort: any failure while inferring the header falls
                        # back to the default VCF header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
 422
    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query on the current connexion and return the result as a
        pandas DataFrame.

        :param query: the SQL query to execute
        :type query: str
        :param limit: when set, fetch only the first chunk of (at most) `limit` rows;
            also raises pandas' "display.max_rows" option to `limit` as a side effect
        :type limit: int (optional)
        :return: A dataframe
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            # NOTE(review): mutates a global pandas display option — affects the
            # whole process, not just this call
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                # Fetch a single Arrow record batch of size `limit` and convert it
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                # First chunk of the chunked query result
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        # NOTE(review): if connexion_format is neither "duckdb" nor "sqlite", df is
        # unbound here and this raises UnboundLocalError
        return df
 456
 457    def get_overview(self) -> None:
 458        """
 459        The function prints the input, output, config, and dataframe of the current object
 460        """
 461        table_variants_from = self.get_table_variants(clause="from")
 462        sql_columns = self.get_header_columns_as_sql()
 463        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
 464        df = self.get_query_to_df(sql_query_export)
 465        log.info(
 466            "Input:  "
 467            + str(self.get_input())
 468            + " ["
 469            + str(str(self.get_input_format()))
 470            + "]"
 471        )
 472        log.info(
 473            "Output: "
 474            + str(self.get_output())
 475            + " ["
 476            + str(str(self.get_output_format()))
 477            + "]"
 478        )
 479        log.info("Config: ")
 480        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
 481            "\n"
 482        ):
 483            log.info("\t" + str(d))
 484        log.info("Param: ")
 485        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
 486            "\n"
 487        ):
 488            log.info("\t" + str(d))
 489        log.info("Sample list: " + str(self.get_header_sample_list()))
 490        log.info("Dataframe: ")
 491        for d in str(df).split("\n"):
 492            log.info("\t" + str(d))
 493
 494        # garbage collector
 495        del df
 496        gc.collect()
 497
 498        return None
 499
    def get_stats(self) -> dict:
        """
        Compute statistics on the loaded variants and return them as a dict.

        The returned dictionary contains the following sections:

        - "Infos": input file, number of variants, number of samples, number
          of INFO and FORMAT fields
        - "Variants": number of variants by chromosome, counts by type
          (Total/SNV/MNV/InDel) and SNV substitution counts
        - "Samples": genotype counts per sample (only when a GT FORMAT field
          and a FORMAT column are present)
        - "Header": description of the INFO and FORMAT header fields
        - "Quality": QUAL statistics (only when a QUAL column is present)

        :return: a dictionary of statistics, structured as described above
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table name used in the FROM clauses below
        table_variants_from = self.get_table_variants()

        # Stats dict skeleton; sections are filled in below
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT field definitions from the VCF header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Number of variants by chromosome, sorted by chromosome name
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Fraction of variants per chromosome (0..1, not multiplied by 100)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Genotype counts per sample, only for VCF-like data with genotypes.
        # NOTE(review): REGEXP_EXTRACT/len/string_split are DuckDB SQL
        # functions; this section presumably assumes a duckdb connexion —
        # confirm behaviour for sqlite connexions
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Extract the leading genotype (e.g. "0/1") from the sample
                # column, keeping only rows whose FORMAT and sample field
                # counts match
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # Count the sample only if at least one genotype was found
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # Running index shared across the INFO and FORMAT sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # Number: map special numeric codes back to their VCF letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # Type ("." when undefined)
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # Description (empty string when undefined)
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL (skip rows where QUAL is the VCF missing value ".")
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the 'InDel' branch below, AND binds tighter than
        # OR, so the condition reads as
        # len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT));
        # parentheses around the OR may be intended — confirm
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitution counts (REF>ALT), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
 721
 722    def stats_to_file(self, file: str = None) -> str:
 723        """
 724        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
 725        into a JSON object, and writes the JSON object to the specified file.
 726
 727        :param file: The `file` parameter is a string that represents the file path where the JSON data
 728        will be written
 729        :type file: str
 730        :return: the name of the file that was written to.
 731        """
 732
 733        # Get stats
 734        stats = self.get_stats()
 735
 736        # Serializing json
 737        json_object = json.dumps(stats, indent=4)
 738
 739        # Writing to sample.json
 740        with open(file, "w") as outfile:
 741            outfile.write(json_object)
 742
 743        return file
 744
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        Generate a markdown report of the statistics and print it to stdout.

        The stats are first written as JSON (via `stats_to_file`), then read
        back and rendered section by section: dict-like values become
        markdown tables, scalar values become "key: value" bullets.

        :param output_file: path of the markdown output file; when omitted, a
            temporary "stats.md" is used (and discarded with the temp dir)
        :type output_file: str
        :param json_file: path of the JSON stats file; when omitted, a
            temporary "stats.json" is used (and discarded with the temp dir)
        :type json_file: str
        :return: None
        """

        # Resolve relative paths / user home in both file paths
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Default both files into the temporary directory if not provided
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create parent folders if needed
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Compute stats and write them to the JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read the stats back (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers: title, index (written to file only) and sections
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # One markdown section per top-level stats entry
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the value as a markdown table; fall
                        # back to a plain "key: value" bullet.
                        # NOTE(review): bare except — any failure (not only a
                        # parse error) silently selects the fallback; confirm
                        # this is intended
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"   - [{info}]({info_link})")
                            # to_markdown needs the optional 'tabulate'
                            # dependency — presumably installed; confirm
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write title, index then sections to the markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Print title and sections (the index is only written to file)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
 846
 847    def get_input(self) -> str:
 848        """
 849        It returns the value of the input variable.
 850        :return: The input is being returned.
 851        """
 852        return self.input
 853
 854    def get_input_format(self, input_file: str = None) -> str:
 855        """
 856        It returns the format of the input variable.
 857        :return: The format is being returned.
 858        """
 859        if not input_file:
 860            input_file = self.get_input()
 861        input_format = get_file_format(input_file)
 862        return input_format
 863
 864    def get_input_compressed(self, input_file: str = None) -> str:
 865        """
 866        It returns the format of the input variable.
 867        :return: The format is being returned.
 868        """
 869        if not input_file:
 870            input_file = self.get_input()
 871        input_compressed = get_file_compressed(input_file)
 872        return input_compressed
 873
 874    def get_output(self) -> str:
 875        """
 876        It returns the output of the neuron.
 877        :return: The output of the neural network.
 878        """
 879        return self.output
 880
 881    def get_output_format(self, output_file: str = None) -> str:
 882        """
 883        It returns the format of the input variable.
 884        :return: The format is being returned.
 885        """
 886        if not output_file:
 887            output_file = self.get_output()
 888        output_format = get_file_format(output_file)
 889
 890        return output_format
 891
 892    def get_config(self) -> dict:
 893        """
 894        It returns the config
 895        :return: The config variable is being returned.
 896        """
 897        return self.config
 898
 899    def get_param(self) -> dict:
 900        """
 901        It returns the param
 902        :return: The param variable is being returned.
 903        """
 904        return self.param
 905
 906    def get_connexion_db(self) -> str:
 907        """
 908        It returns the connexion_db attribute of the object
 909        :return: The connexion_db is being returned.
 910        """
 911        return self.connexion_db
 912
 913    def get_prefix(self) -> str:
 914        """
 915        It returns the prefix of the object.
 916        :return: The prefix is being returned.
 917        """
 918        return self.prefix
 919
 920    def get_table_variants(self, clause: str = "select") -> str:
 921        """
 922        This function returns the table_variants attribute of the object
 923
 924        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 925        defaults to select (optional)
 926        :return: The table_variants attribute of the object.
 927        """
 928
 929        # Access
 930        access = self.get_config().get("access", None)
 931
 932        # Clauses "select", "where", "update"
 933        if clause in ["select", "where", "update"]:
 934            table_variants = self.table_variants
 935        # Clause "from"
 936        elif clause in ["from"]:
 937            # For Read Only
 938            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 939                input_file = self.get_input()
 940                table_variants = f"'{input_file}' as variants"
 941            # For Read Write
 942            else:
 943                table_variants = f"{self.table_variants} as variants"
 944        else:
 945            table_variants = self.table_variants
 946        return table_variants
 947
 948    def get_tmp_dir(self) -> str:
 949        """
 950        The function `get_tmp_dir` returns the temporary directory path based on configuration
 951        parameters or a default path.
 952        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 953        configuration, parameters, and a default value of "/tmp".
 954        """
 955
 956        return get_tmp(
 957            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 958        )
 959
 960    def get_connexion_type(self) -> str:
 961        """
 962        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 963
 964        :return: The connexion type is being returned.
 965        """
 966        return self.get_config().get("connexion_type", "memory")
 967
 968    def get_connexion(self):
 969        """
 970        It returns the connection object
 971
 972        :return: The connection object.
 973        """
 974        return self.conn
 975
 976    def close_connexion(self) -> None:
 977        """
 978        This function closes the connection to the database.
 979        :return: The connection is being closed.
 980        """
 981        return self.conn.close()
 982
 983    def get_header(self, type: str = "vcf"):
 984        """
 985        This function returns the header of the VCF file as a list of strings
 986
 987        :param type: the type of header you want to get, defaults to vcf (optional)
 988        :return: The header of the vcf file.
 989        """
 990
 991        if self.header_vcf:
 992            if type == "vcf":
 993                return self.header_vcf
 994            elif type == "list":
 995                return self.header_list
 996        else:
 997            if type == "vcf":
 998                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 999                return header
1000            elif type == "list":
1001                return vcf_required
1002
1003    def get_header_length(self, file: str = None) -> int:
1004        """
1005        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1006        line.
1007
1008        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1009        header file. If this argument is provided, the function will read the header from the specified
1010        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1011        :type file: str
1012        :return: the length of the header list, excluding the #CHROM line.
1013        """
1014
1015        if file:
1016            return len(self.read_vcf_header_file(file=file)) - 1
1017        elif self.get_header(type="list"):
1018            return len(self.get_header(type="list")) - 1
1019        else:
1020            return 0
1021
1022    def get_header_columns(self) -> str:
1023        """
1024        This function returns the header list of a VCF
1025
1026        :return: The length of the header list.
1027        """
1028        if self.get_header():
1029            return self.get_header(type="list")[-1]
1030        else:
1031            return ""
1032
1033    def get_header_columns_as_list(self) -> list:
1034        """
1035        This function returns the header list of a VCF
1036
1037        :return: The length of the header list.
1038        """
1039        if self.get_header():
1040            return self.get_header_columns().strip().split("\t")
1041        else:
1042            return []
1043
1044    def get_header_columns_as_sql(self) -> str:
1045        """
1046        This function retruns header length (without #CHROM line)
1047
1048        :return: The length of the header list.
1049        """
1050        sql_column_list = []
1051        for col in self.get_header_columns_as_list():
1052            sql_column_list.append(f'"{col}"')
1053        return ",".join(sql_column_list)
1054
1055    def get_header_sample_list(self) -> list:
1056        """
1057        This function retruns header length (without #CHROM line)
1058
1059        :return: The length of the header list.
1060        """
1061        return self.header_vcf.samples
1062
1063    def get_verbose(self) -> bool:
1064        """
1065        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1066        exist
1067
1068        :return: The value of the key "verbose" in the config dictionary.
1069        """
1070        return self.get_config().get("verbose", False)
1071
1072    def get_connexion_format(self) -> str:
1073        """
1074        It returns the connexion format of the object.
1075        :return: The connexion_format is being returned.
1076        """
1077        connexion_format = self.connexion_format
1078        if connexion_format not in ["duckdb", "sqlite"]:
1079            log.error(f"Unknown connexion format {connexion_format}")
1080            raise ValueError(f"Unknown connexion format {connexion_format}")
1081        else:
1082            return connexion_format
1083
1084    def insert_file_to_table(
1085        self,
1086        file,
1087        columns: str,
1088        header_len: int = 0,
1089        sep: str = "\t",
1090        chunksize: int = 1000000,
1091    ) -> None:
1092        """
1093        The function reads a file in chunks, and inserts each chunk into a table
1094
1095        :param file: the file to be loaded
1096        :param columns: a string of the column names separated by commas
1097        :param header_len: the number of lines to skip at the beginning of the file, defaults to 0
1098        (optional)
1099        :param sep: the separator used in the file, defaults to \t (optional)
1100        :param chunksize: The number of rows to read in at a time, defaults to 1000000 (optional)
1101        """
1102
1103        # Config
1104        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
1105        connexion_format = self.get_connexion_format()
1106
1107        log.debug("chunksize: " + str(chunksize))
1108
1109        if chunksize:
1110            for chunk in pd.read_csv(
1111                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
1112            ):
1113                if connexion_format in ["duckdb"]:
1114                    sql_insert_into = (
1115                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
1116                    )
1117                    self.conn.execute(sql_insert_into)
1118                elif connexion_format in ["sqlite"]:
1119                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
1120
1121    def load_data(
1122        self,
1123        input_file: str = None,
1124        drop_variants_table: bool = False,
1125        sample_size: int = 20480,
1126    ) -> None:
1127        """
1128        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1129        table before loading the data and specify a sample size.
1130
1131        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1132        table
1133        :type input_file: str
1134        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1135        determines whether the variants table should be dropped before loading the data. If set to
1136        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1137        not be dropped, defaults to False
1138        :type drop_variants_table: bool (optional)
1139        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1140        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1141        20480
1142        :type sample_size: int (optional)
1143        """
1144
1145        log.info("Loading...")
1146
1147        # change input file
1148        if input_file:
1149            self.set_input(input_file)
1150            self.set_header()
1151
1152        # drop variants table
1153        if drop_variants_table:
1154            self.drop_variants_table()
1155
1156        # get table variants
1157        table_variants = self.get_table_variants()
1158
1159        # Access
1160        access = self.get_config().get("access", None)
1161        log.debug(f"access: {access}")
1162
1163        # Input format and compress
1164        input_format = self.get_input_format()
1165        input_compressed = self.get_input_compressed()
1166        log.debug(f"input_format: {input_format}")
1167        log.debug(f"input_compressed: {input_compressed}")
1168
1169        # input_compressed_format
1170        if input_compressed:
1171            input_compressed_format = "gzip"
1172        else:
1173            input_compressed_format = "none"
1174        log.debug(f"input_compressed_format: {input_compressed_format}")
1175
1176        # Connexion format
1177        connexion_format = self.get_connexion_format()
1178
1179        # Sample size
1180        if not sample_size:
1181            sample_size = -1
1182        log.debug(f"sample_size: {sample_size}")
1183
1184        # Load data
1185        log.debug(f"Load Data from {input_format}")
1186
1187        # DuckDB connexion
1188        if connexion_format in ["duckdb"]:
1189
1190            # Database already exists
1191            if self.input_format in ["db", "duckdb"]:
1192
1193                if connexion_format in ["duckdb"]:
1194                    log.debug(f"Input file format '{self.input_format}' duckDB")
1195                else:
1196                    log.error(
1197                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1198                    )
1199                    raise ValueError(
1200                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1201                    )
1202
1203            # Load from existing database format
1204            else:
1205
1206                try:
1207                    # Create Table or View
1208                    database = Database(database=self.input)
1209                    sql_from = database.get_sql_from(sample_size=sample_size)
1210
1211                    if access in ["RO"]:
1212                        sql_load = (
1213                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1214                        )
1215                    else:
1216                        sql_load = (
1217                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1218                        )
1219                    self.conn.execute(sql_load)
1220
1221                except:
1222                    # Format not available
1223                    log.error(f"Input file format '{self.input_format}' not available")
1224                    raise ValueError(
1225                        f"Input file format '{self.input_format}' not available"
1226                    )
1227
1228        # SQLite connexion
1229        elif connexion_format in ["sqlite"] and input_format in [
1230            "vcf",
1231            "tsv",
1232            "csv",
1233            "psv",
1234        ]:
1235
1236            # Main structure
1237            structure = {
1238                "#CHROM": "VARCHAR",
1239                "POS": "INTEGER",
1240                "ID": "VARCHAR",
1241                "REF": "VARCHAR",
1242                "ALT": "VARCHAR",
1243                "QUAL": "VARCHAR",
1244                "FILTER": "VARCHAR",
1245                "INFO": "VARCHAR",
1246            }
1247
1248            # Strcuture with samples
1249            structure_complete = structure
1250            if self.get_header_sample_list():
1251                structure["FORMAT"] = "VARCHAR"
1252                for sample in self.get_header_sample_list():
1253                    structure_complete[sample] = "VARCHAR"
1254
1255            # Columns list for create and insert
1256            sql_create_table_columns = []
1257            sql_create_table_columns_list = []
1258            for column in structure_complete:
1259                column_type = structure_complete[column]
1260                sql_create_table_columns.append(
1261                    f'"{column}" {column_type} default NULL'
1262                )
1263                sql_create_table_columns_list.append(f'"{column}"')
1264
1265            # Create database
1266            log.debug(f"Create Table {table_variants}")
1267            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1268            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1269            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1270            self.conn.execute(sql_create_table)
1271
1272            # chunksize define length of file chunk load file
1273            chunksize = 100000
1274
1275            # delimiter
1276            delimiter = file_format_delimiters.get(input_format, "\t")
1277
1278            # Load the input file
1279            with open(self.input, "rt") as input_file:
1280
1281                # Use the appropriate file handler based on the input format
1282                if input_compressed:
1283                    input_file = bgzf.open(self.input, "rt")
1284                if input_format in ["vcf"]:
1285                    header_len = self.get_header_length()
1286                else:
1287                    header_len = 0
1288
1289                # Insert the file contents into a table
1290                self.insert_file_to_table(
1291                    input_file,
1292                    columns=sql_create_table_columns_list_sql,
1293                    header_len=header_len,
1294                    sep=delimiter,
1295                    chunksize=chunksize,
1296                )
1297
1298        else:
1299            log.error(
1300                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1301            )
1302            raise ValueError(
1303                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1304            )
1305
1306        # Explode INFOS fields into table fields
1307        if self.get_explode_infos():
1308            self.explode_infos(
1309                prefix=self.get_explode_infos_prefix(),
1310                fields=self.get_explode_infos_fields(),
1311                force=True,
1312            )
1313
1314        # Create index after insertion
1315        self.create_indexes()
1316
1317    def get_explode_infos(self) -> bool:
1318        """
1319        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1320        to False if it is not set.
1321        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1322        value. If the parameter is not present, it will return False.
1323        """
1324
1325        return self.get_param().get("explode", {}).get("explode_infos", False)
1326
1327    def get_explode_infos_fields(
1328        self,
1329        explode_infos_fields: str = None,
1330        remove_fields_not_in_header: bool = False,
1331    ) -> list:
1332        """
1333        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1334        the input parameter `explode_infos_fields`.
1335
1336        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1337        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1338        comma-separated list of field names to explode
1339        :type explode_infos_fields: str
1340        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1341        flag that determines whether to remove fields that are not present in the header. If it is set
1342        to `True`, any field that is not in the header will be excluded from the list of exploded
1343        information fields. If it is set to `, defaults to False
1344        :type remove_fields_not_in_header: bool (optional)
1345        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1346        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1347        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1348        Otherwise, it returns a list of exploded information fields after removing any spaces and
1349        splitting the string by commas.
1350        """
1351
1352        # If no fields, get it in param
1353        if not explode_infos_fields:
1354            explode_infos_fields = (
1355                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1356            )
1357
1358        # If no fields, defined as all fields in header using keyword
1359        if not explode_infos_fields:
1360            explode_infos_fields = "*"
1361
1362        # If fields list not empty
1363        if explode_infos_fields:
1364
1365            # Input fields list
1366            if isinstance(explode_infos_fields, str):
1367                fields_input = explode_infos_fields.split(",")
1368            elif isinstance(explode_infos_fields, list):
1369                fields_input = explode_infos_fields
1370            else:
1371                fields_input = []
1372
1373            # Fields list without * keyword
1374            fields_without_all = fields_input.copy()
1375            if "*".casefold() in (item.casefold() for item in fields_without_all):
1376                fields_without_all.remove("*")
1377
1378            # Fields in header
1379            fields_in_header = sorted(list(set(self.get_header().infos)))
1380
1381            # Construct list of fields
1382            fields_output = []
1383            for field in fields_input:
1384
1385                # Strip field
1386                field = field.strip()
1387
1388                # format keyword * in regex
1389                if field.upper() in ["*"]:
1390                    field = ".*"
1391
1392                # Find all fields with pattern
1393                r = re.compile(field)
1394                fields_search = sorted(list(filter(r.match, fields_in_header)))
1395
1396                # Remove fields input from search
1397                if fields_search != [field]:
1398                    fields_search = sorted(
1399                        list(set(fields_search).difference(fields_input))
1400                    )
1401
1402                # If field is not in header (avoid not well formatted header)
1403                if not fields_search and not remove_fields_not_in_header:
1404                    fields_search = [field]
1405
1406                # Add found fields
1407                for new_field in fields_search:
1408                    # Add field, if not already exists, and if it is in header (if asked)
1409                    if (
1410                        new_field not in fields_output
1411                        and (
1412                            not remove_fields_not_in_header
1413                            or new_field in fields_in_header
1414                        )
1415                        and new_field not in [".*"]
1416                    ):
1417                        fields_output.append(new_field)
1418
1419            return fields_output
1420
1421        else:
1422
1423            return []
1424
1425    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1426        """
1427        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1428        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1429        not provided.
1430
1431        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1432        prefix to be used for exploding or expanding information
1433        :type explode_infos_prefix: str
1434        :return: the value of the variable `explode_infos_prefix`.
1435        """
1436
1437        if not explode_infos_prefix:
1438            explode_infos_prefix = (
1439                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1440            )
1441
1442        return explode_infos_prefix
1443
1444    def add_column(
1445        self,
1446        table_name,
1447        column_name,
1448        column_type,
1449        default_value=None,
1450        drop: bool = False,
1451    ) -> dict:
1452        """
1453        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1454        doesn't already exist.
1455
1456        :param table_name: The name of the table to which you want to add a column
1457        :param column_name: The parameter "column_name" is the name of the column that you want to add
1458        to the table
1459        :param column_type: The `column_type` parameter specifies the data type of the column that you
1460        want to add to the table. It should be a string that represents the desired data type, such as
1461        "INTEGER", "TEXT", "REAL", etc
1462        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1463        default value for the newly added column. If a default value is provided, it will be assigned to
1464        the column for any existing rows that do not have a value for that column
1465        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1466        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1467        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1468        to False
1469        :type drop: bool (optional)
1470        :return: a boolean value indicating whether the column was successfully added to the table.
1471        """
1472
1473        # added
1474        added = False
1475        dropped = False
1476
1477        # Check if the column already exists in the table
1478        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1479        columns = self.get_query_to_df(query).columns.tolist()
1480        if column_name in columns:
1481            log.debug(
1482                f"The {column_name} column already exists in the {table_name} table"
1483            )
1484            if drop:
1485                self.drop_column(table_name=table_name, column_name=column_name)
1486                dropped = True
1487            else:
1488                return None
1489        else:
1490            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1491
1492        # Add column in table
1493        add_column_query = (
1494            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1495        )
1496        if default_value is not None:
1497            add_column_query += f" DEFAULT {default_value}"
1498        self.execute_query(add_column_query)
1499        added = not dropped
1500        log.debug(
1501            f"The {column_name} column was successfully added to the {table_name} table"
1502        )
1503
1504        if added:
1505            added_column = {
1506                "table_name": table_name,
1507                "column_name": column_name,
1508                "column_type": column_type,
1509                "default_value": default_value,
1510            }
1511        else:
1512            added_column = None
1513
1514        return added_column
1515
1516    def drop_column(
1517        self, column: dict = None, table_name: str = None, column_name: str = None
1518    ) -> bool:
1519        """
1520        The `drop_column` function drops a specified column from a given table in a database and returns
1521        True if the column was successfully dropped, and False if the column does not exist in the
1522        table.
1523
1524        :param column: The `column` parameter is a dictionary that contains information about the column
1525        you want to drop. It has two keys:
1526        :type column: dict
1527        :param table_name: The `table_name` parameter is the name of the table from which you want to
1528        drop a column
1529        :type table_name: str
1530        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1531        from the table
1532        :type column_name: str
1533        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1534        and False if the column does not exist in the table.
1535        """
1536
1537        # Find column infos
1538        if column:
1539            if isinstance(column, dict):
1540                table_name = column.get("table_name", None)
1541                column_name = column.get("column_name", None)
1542            elif isinstance(column, str):
1543                table_name = self.get_table_variants()
1544                column_name = column
1545            else:
1546                table_name = None
1547                column_name = None
1548
1549        if not table_name and not column_name:
1550            return False
1551
1552        # Removed
1553        removed = False
1554
1555        # Check if the column already exists in the table
1556        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1557        columns = self.get_query_to_df(query).columns.tolist()
1558        if column_name in columns:
1559            log.debug(f"The {column_name} column exists in the {table_name} table")
1560        else:
1561            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1562            return False
1563
1564        # Add column in table # ALTER TABLE integers DROP k
1565        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1566        self.execute_query(add_column_query)
1567        removed = True
1568        log.debug(
1569            f"The {column_name} column was successfully dropped to the {table_name} table"
1570        )
1571
1572        return removed
1573
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
        columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
        individual columns. If this parameter is not provided, all INFO fields will be exploded
        :type fields: list
        :param force: The `force` parameter is a boolean flag that determines whether to drop and
        recreate the column if it already exists in the table. If `force` is set to `True`, the column
        will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns.
        """

        # drop indexes (they would otherwise slow down the bulk UPDATEs below)
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite"): selects the SQL dialect used below
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Read-only databases cannot be altered: skip explosion entirely
        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: fall back to an empty list on any failure)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One SET clause per field, collected then applied per chromosome
            sql_info_alter_table_array = []

            # Info fields to check (header fields plus any explicitly requested)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex / "*" keyword expansion)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the column holding the exploded value
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Field type/cardinality from the header; unknown fields
                    # default to a multi-valued String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR regardless of type
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # raw INFO column using the backend's string functions
                        # NOTE(review): if connexion_format is neither "duckdb"
                        # nor "sqlite", update_info_field is unbound (or stale
                        # from a previous iteration) when appended below —
                        # confirm formats are restricted upstream
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes — presumably to bound the size of each UPDATE;
                # best-effort: fall back to a single unfiltered pass on failure
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (omitted when there is a single chromosome,
                    # so the UPDATE covers the whole table in one pass)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting every exploded field at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
1779
1780    def create_indexes(self) -> None:
1781        """
1782        Create indexes on the table after insertion
1783        """
1784
1785        # Access
1786        access = self.get_config().get("access", None)
1787
1788        # get table variants
1789        table_variants = self.get_table_variants("FROM")
1790
1791        if self.get_indexing() and access not in ["RO"]:
1792            # Create index
1793            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1794            self.conn.execute(sql_create_table_index)
1795            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1796            self.conn.execute(sql_create_table_index)
1797            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1798            self.conn.execute(sql_create_table_index)
1799            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1800            self.conn.execute(sql_create_table_index)
1801            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1802            self.conn.execute(sql_create_table_index)
1803            for field in self.index_additionnal_fields:
1804                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1805                self.conn.execute(sql_create_table_index)
1806
1807    def drop_indexes(self) -> None:
1808        """
1809        Create indexes on the table after insertion
1810        """
1811
1812        # Access
1813        access = self.get_config().get("access", None)
1814
1815        # get table variants
1816        table_variants = self.get_table_variants("FROM")
1817
1818        # Get database format
1819        connexion_format = self.get_connexion_format()
1820
1821        if access not in ["RO"]:
1822            if connexion_format in ["duckdb"]:
1823                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1824            elif connexion_format in ["sqlite"]:
1825                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1826
1827            list_indexes = self.conn.execute(sql_list_indexes)
1828            index_names = [row[0] for row in list_indexes.fetchall()]
1829            for index in index_names:
1830                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1831                self.conn.execute(sql_drop_table_index)
1832
1833    def read_vcf_header(self, f) -> list:
1834        """
1835        It reads the header of a VCF file and returns a list of the header lines
1836
1837        :param f: the file object
1838        :return: The header lines of the VCF file.
1839        """
1840
1841        header_list = []
1842        for line in f:
1843            header_list.append(line)
1844            if line.startswith("#CHROM"):
1845                break
1846        return header_list
1847
    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header lines of a VCF file on disk.

        Compression is auto-detected via `self.get_input_compressed`:
        BGZF-compressed files are opened with `Bio.bgzf`, plain files with
        the builtin `open`; the actual header parsing is delegated to
        `read_vcf_header`.

        :param file: path to the VCF file whose header should be read
        :type file: str
        :return: the list of header lines, "#CHROM" line included.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)
1871
1872    def execute_query(self, query: str):
1873        """
1874        It takes a query as an argument, executes it, and returns the results
1875
1876        :param query: The query to be executed
1877        :return: The result of the query is being returned.
1878        """
1879        if query:
1880            return self.conn.execute(query)  # .fetchall()
1881        else:
1882            return None
1883
1884    def export_output(
1885        self,
1886        output_file: str | None = None,
1887        output_header: str | None = None,
1888        export_header: bool = True,
1889        query: str | None = None,
1890        parquet_partitions: list | None = None,
1891        chunk_size: int | None = None,
1892        threads: int | None = None,
1893        sort: bool = False,
1894        index: bool = False,
1895        order_by: str | None = None,
1896    ) -> bool:
1897        """
1898        The `export_output` function exports data from a VCF file to a specified output file in various
1899        formats, including VCF, CSV, TSV, PSV, and Parquet.
1900
1901        :param output_file: The `output_file` parameter is a string that specifies the name of the
1902        output file to be generated by the function. This is where the exported data will be saved
1903        :type output_file: str
1904        :param output_header: The `output_header` parameter is a string that specifies the name of the
1905        file where the header of the VCF file will be exported. If this parameter is not provided, the
1906        header will be exported to a file with the same name as the `output_file` parameter, but with
1907        the extension "
1908        :type output_header: str
1909        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1910        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1911        True, the header will be exported to a file. If `export_header` is False, the header will not
1912        be, defaults to True, if output format is not VCF
1913        :type export_header: bool (optional)
1914        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1915        select specific data from the VCF file before exporting it. If provided, only the data that
1916        matches the query will be exported
1917        :type query: str
1918        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1919        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1920        organize data in a hierarchical directory structure based on the values of one or more columns.
1921        This can improve query performance when working with large datasets
1922        :type parquet_partitions: list
1923        :param chunk_size: The `chunk_size` parameter specifies the number of
1924        records in batch when exporting data in Parquet format. This parameter is used for
1925        partitioning the Parquet file into multiple files.
1926        :type chunk_size: int
1927        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1928        threads to be used during the export process. It determines the level of parallelism and can
1929        improve the performance of the export operation. If not provided, the function will use the
1930        default number of threads
1931        :type threads: int
1932        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1933        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1934        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1935        False
1936        :type sort: bool (optional)
1937        :param index: The `index` parameter is a boolean flag that determines whether an index should be
1938        created on the output file. If `index` is True, an index will be created. If `index` is False,
1939        no index will be created. The default value is False, defaults to False
1940        :type index: bool (optional)
1941        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
1942        sorting the output file. This parameter is only applicable when exporting data in VCF format
1943        :type order_by: str
1944        :return: a boolean value. It checks if the output file exists and returns True if it does, or
1945        None if it doesn't.
1946        """
1947
1948        # Log
1949        log.info("Exporting...")
1950
1951        # Full path
1952        output_file = full_path(output_file)
1953        output_header = full_path(output_header)
1954
1955        # Config
1956        config = self.get_config()
1957
1958        # Param
1959        param = self.get_param()
1960
1961        # Tmp files to remove
1962        tmp_to_remove = []
1963
1964        # If no output, get it
1965        if not output_file:
1966            output_file = self.get_output()
1967
1968        # If not threads
1969        if not threads:
1970            threads = self.get_threads()
1971
1972        # Auto header name with extension
1973        if export_header or output_header:
1974            if not output_header:
1975                output_header = f"{output_file}.hdr"
1976            # Export header
1977            self.export_header(output_file=output_file)
1978
1979        # Switch off export header if VCF output
1980        output_file_type = get_file_format(output_file)
1981        if output_file_type in ["vcf"]:
1982            export_header = False
1983            tmp_to_remove.append(output_header)
1984
1985        # Chunk size
1986        if not chunk_size:
1987            chunk_size = config.get("chunk_size", None)
1988
1989        # Parquet partition
1990        if not parquet_partitions:
1991            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
1992        if parquet_partitions and isinstance(parquet_partitions, str):
1993            parquet_partitions = parquet_partitions.split(",")
1994
1995        # Order by
1996        if not order_by:
1997            order_by = param.get("export", {}).get("order_by", "")
1998
1999        # Header in output
2000        header_in_output = param.get("export", {}).get("include_header", False)
2001
2002        # Database
2003        database_source = self.get_connexion()
2004
2005        # Connexion format
2006        connexion_format = self.get_connexion_format()
2007
2008        # Explode infos
2009        if self.get_explode_infos():
2010            self.explode_infos(
2011                prefix=self.get_explode_infos_prefix(),
2012                fields=self.get_explode_infos_fields(),
2013                force=False,
2014            )
2015
2016        # if connexion_format in ["sqlite"] or query:
2017        if connexion_format in ["sqlite"]:
2018
2019            # Export in Parquet
2020            random_tmp = "".join(
2021                random.choice(string.ascii_lowercase) for i in range(10)
2022            )
2023            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2024            tmp_to_remove.append(database_source)
2025
2026            # Table Variants
2027            table_variants = self.get_table_variants()
2028
2029            # Create export query
2030            sql_query_export_subquery = f"""
2031                SELECT * FROM {table_variants}
2032                """
2033
2034            # Write source file
2035            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2036
2037        # Create database
2038        database = Database(
2039            database=database_source,
2040            table="variants",
2041            header_file=output_header,
2042            conn_config=self.get_connexion_config(),
2043        )
2044
2045        # Existing colomns header
2046        # existing_columns_header = database.get_header_file_columns(output_header)
2047        existing_columns_header = database.get_header_columns_from_database()
2048
2049        # Export file
2050        database.export(
2051            output_database=output_file,
2052            output_header=output_header,
2053            existing_columns_header=existing_columns_header,
2054            parquet_partitions=parquet_partitions,
2055            chunk_size=chunk_size,
2056            threads=threads,
2057            sort=sort,
2058            index=index,
2059            header_in_output=header_in_output,
2060            order_by=order_by,
2061            query=query,
2062            export_header=export_header,
2063        )
2064
2065        # Remove
2066        remove_if_exists(tmp_to_remove)
2067
2068        return (os.path.exists(output_file) or None) and (
2069            os.path.exists(output_file) or None
2070        )
2071
2072    def get_extra_infos(self, table: str = None) -> list:
2073        """
2074        > This function returns a list of columns that are in the table but not in the header
2075
2076        The function is called `get_extra_infos` and it takes two arguments: `self` and `table`. The
2077        `self` argument is a reference to the object that called the function. The `table` argument is
2078        the name of the table that we want to get the extra columns from
2079
2080        :param table: The table to get the extra columns from. If not specified, it will use the
2081        variants table
2082        :param format: The format of the output. If it's "sql", it will return a string of the extra
2083        columns separated by commas. If it's "list", it will return a list of the extra columns
2084        :return: A list of columns that are in the table but not in the header
2085        """
2086
2087        header_columns = []
2088
2089        if not table:
2090            table = self.get_table_variants(clause="from")
2091            header_columns = self.get_header_columns()
2092
2093        # Check all columns in the database
2094        query = f""" SELECT * FROM {table} LIMIT 1 """
2095        log.debug(f"query {query}")
2096        table_columns = self.get_query_to_df(query).columns.tolist()
2097        extra_columns = []
2098
2099        # Construct extra infos (not in header)
2100        for column in table_columns:
2101            if column not in header_columns:
2102                extra_columns.append(column)
2103
2104        return extra_columns
2105
2106    def get_extra_infos_sql(self, table: str = None) -> str:
2107        """
2108        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2109        by double quotes
2110
2111        :param table: The name of the table to get the extra infos from. If None, the default table is
2112        used
2113        :type table: str
2114        :return: A string of the extra infos
2115        """
2116
2117        return ", ".join(
2118            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2119        )
2120
2121    def export_header(
2122        self,
2123        header_name: str = None,
2124        output_file: str = None,
2125        output_file_ext: str = ".hdr",
2126        clean_header: bool = True,
2127        remove_chrom_line: bool = False,
2128    ) -> str:
2129        """
2130        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2131        specified options, and writes it to a new file.
2132
2133        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2134        this parameter is not specified, the header will be written to the output file
2135        :type header_name: str
2136        :param output_file: The `output_file` parameter in the `export_header` function is used to
2137        specify the name of the output file where the header will be written. If this parameter is not
2138        provided, the header will be written to a temporary file
2139        :type output_file: str
2140        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2141        string that represents the extension of the output header file. By default, it is set to ".hdr"
2142        if not specified by the user. This extension will be appended to the `output_file` name to
2143        create the final, defaults to .hdr
2144        :type output_file_ext: str (optional)
2145        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2146        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2147        `True`, the function will clean the header by modifying certain lines based on a specific
2148        pattern. If `clean_header`, defaults to True
2149        :type clean_header: bool (optional)
2150        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2151        boolean flag that determines whether the #CHROM line should be removed from the header before
2152        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2153        defaults to False
2154        :type remove_chrom_line: bool (optional)
2155        :return: The function `export_header` returns the name of the temporary header file that is
2156        created.
2157        """
2158
2159        if not header_name and not output_file:
2160            output_file = self.get_output()
2161
2162        if self.get_header():
2163
2164            # Get header object
2165            header_obj = self.get_header()
2166
2167            # Create database
2168            db_for_header = Database(database=self.get_input())
2169
2170            # Get real columns in the file
2171            db_header_columns = db_for_header.get_columns()
2172
2173            with tempfile.TemporaryDirectory() as tmpdir:
2174
2175                # Write header file
2176                header_file_tmp = os.path.join(tmpdir, "header")
2177                f = open(header_file_tmp, "w")
2178                vcf.Writer(f, header_obj)
2179                f.close()
2180
2181                # Replace #CHROM line with rel columns
2182                header_list = db_for_header.read_header_file(
2183                    header_file=header_file_tmp
2184                )
2185                header_list[-1] = "\t".join(db_header_columns)
2186
2187                # Remove CHROM line
2188                if remove_chrom_line:
2189                    header_list.pop()
2190
2191                # Clean header
2192                if clean_header:
2193                    header_list_clean = []
2194                    for head in header_list:
2195                        # Clean head for malformed header
2196                        head_clean = head
2197                        head_clean = re.subn(
2198                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2199                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2200                            head_clean,
2201                            2,
2202                        )[0]
2203                        # Write header
2204                        header_list_clean.append(head_clean)
2205                    header_list = header_list_clean
2206
2207            tmp_header_name = output_file + output_file_ext
2208
2209            f = open(tmp_header_name, "w")
2210            for line in header_list:
2211                f.write(line)
2212            f.close()
2213
2214        return tmp_header_name
2215
2216    def export_variant_vcf(
2217        self,
2218        vcf_file,
2219        remove_info: bool = False,
2220        add_samples: bool = True,
2221        list_samples: list = [],
2222        index: bool = False,
2223        threads: int | None = None,
2224    ) -> bool | None:
2225        """
2226        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2227        remove INFO field, add samples, and control compression and indexing.
2228
2229        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2230        written to. It is the output file that will contain the filtered VCF data based on the specified
2231        parameters
2232        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2233        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2234        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2235        in, defaults to False
2236        :type remove_info: bool (optional)
2237        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2238        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2239        If set to False, the samples will be removed. The default value is True, defaults to True
2240        :type add_samples: bool (optional)
2241        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2242        in the output VCF file. By default, all samples will be included. If you provide a list of
2243        samples, only those samples will be included in the output file
2244        :type list_samples: list
2245        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2246        determines whether or not to create an index for the output VCF file. If `index` is set to
2247        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2248        :type index: bool (optional)
2249        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2250        number of threads to use for exporting the VCF file. It determines how many parallel threads
2251        will be used during the export process. More threads can potentially speed up the export process
2252        by utilizing multiple cores of the processor. If
2253        :type threads: int | None
2254        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2255        method with various parameters including the output file, query, threads, sort flag, and index
2256        flag. The `export_output` method is responsible for exporting the VCF data based on the
2257        specified parameters and configurations provided in the `export_variant_vcf` function.
2258        """
2259
2260        # Config
2261        config = self.get_config()
2262
2263        # Extract VCF
2264        log.debug("Export VCF...")
2265
2266        # Table variants
2267        table_variants = self.get_table_variants()
2268
2269        # Threads
2270        if not threads:
2271            threads = self.get_threads()
2272
2273        # Info fields
2274        if remove_info:
2275            if not isinstance(remove_info, str):
2276                remove_info = "."
2277            info_field = f"""'{remove_info}' as INFO"""
2278        else:
2279            info_field = "INFO"
2280
2281        # Samples fields
2282        if add_samples:
2283            if not list_samples:
2284                list_samples = self.get_header_sample_list()
2285            if list_samples:
2286                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2287            else:
2288                samples_fields = ""
2289            log.debug(f"samples_fields: {samples_fields}")
2290        else:
2291            samples_fields = ""
2292
2293        # Variants
2294        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2295        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """
2296
2297        return self.export_output(
2298            output_file=vcf_file,
2299            output_header=None,
2300            export_header=True,
2301            query=sql_query_select,
2302            parquet_partitions=None,
2303            chunk_size=config.get("chunk_size", None),
2304            threads=threads,
2305            sort=True,
2306            index=index,
2307            order_by=None,
2308        )
2309
2310    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2311        """
2312        It takes a list of commands and runs them in parallel using the number of threads specified
2313
2314        :param commands: A list of commands to run
2315        :param threads: The number of threads to use, defaults to 1 (optional)
2316        """
2317
2318        run_parallel_commands(commands, threads)
2319
2320    def get_threads(self, default: int = 1) -> int:
2321        """
2322        This function returns the number of threads to use for a job, with a default value of 1 if not
2323        specified.
2324
2325        :param default: The `default` parameter in the `get_threads` method is used to specify the
2326        default number of threads to use if no specific value is provided. If no value is provided for
2327        the `threads` parameter in the configuration or input parameters, the `default` value will be
2328        used, defaults to 1
2329        :type default: int (optional)
2330        :return: the number of threads to use for the current job.
2331        """
2332
2333        # Config
2334        config = self.get_config()
2335
2336        # Param
2337        param = self.get_param()
2338
2339        # Input threads
2340        input_thread = param.get("threads", config.get("threads", None))
2341
2342        # Check threads
2343        if not input_thread:
2344            threads = default
2345        elif int(input_thread) <= 0:
2346            threads = os.cpu_count()
2347        else:
2348            threads = int(input_thread)
2349        return threads
2350
2351    def get_memory(self, default: str = None) -> str:
2352        """
2353        This function retrieves the memory value from parameters or configuration with a default value
2354        if not found.
2355
2356        :param default: The `get_memory` function takes in a default value as a string parameter. This
2357        default value is used as a fallback in case the `memory` parameter is not provided in the
2358        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2359        the function
2360        :type default: str
2361        :return: The `get_memory` function returns a string value representing the memory parameter. If
2362        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2363        return the default value provided as an argument to the function.
2364        """
2365
2366        # Config
2367        config = self.get_config()
2368
2369        # Param
2370        param = self.get_param()
2371
2372        # Input threads
2373        input_memory = param.get("memory", config.get("memory", None))
2374
2375        # Check threads
2376        if input_memory:
2377            memory = input_memory
2378        else:
2379            memory = default
2380
2381        return memory
2382
2383    def update_from_vcf(self, vcf_file: str) -> None:
2384        """
2385        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2386
2387        :param vcf_file: the path to the VCF file
2388        """
2389
2390        connexion_format = self.get_connexion_format()
2391
2392        if connexion_format in ["duckdb"]:
2393            self.update_from_vcf_duckdb(vcf_file)
2394        elif connexion_format in ["sqlite"]:
2395            self.update_from_vcf_sqlite(vcf_file)
2396
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using DuckDB's ability to query a local pandas DataFrame by
        name.

        Rows are matched on #CHROM/POS/REF/ALT; the VCF INFO is appended to the
        existing INFO, with a ';' separator when both values are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        # NOTE: vcf_df looks unused, but it is referenced by name inside the
        # SQL below (DuckDB resolves in-scope DataFrames as tables)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO; the CASE
        # expressions only insert ';' when both sides are non-empty ('' or '.')
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
2452
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file, using a
        temporary SQLite table.

        The VCF body is loaded into a temporary table, then the INFO of each
        matching variant (#CHROM/POS/REF/ALT) is appended to the existing INFO,
        with a ';' separator when both values are non-empty.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants' (no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table
        # NOTE(review): exactly 8 columns are assumed (no FORMAT/sample
        # columns) — TODO confirm input VCFs never carry genotype columns here
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite has no CONCAT function)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
2510
2511    def drop_variants_table(self) -> None:
2512        """
2513        > This function drops the variants table
2514        """
2515
2516        table_variants = self.get_table_variants()
2517        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2518        self.conn.execute(sql_table_variants)
2519
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column is (re)populated even if it
        already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly (param > config > default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE (columns added here are dropped again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column (fall back to the default name if empty)
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column if missing, or when forced
        # NOTE(review): the existence check uses the literal "variant_id", not
        # `variant_id_column` — TODO confirm custom column names are handled
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' is hashed as a quoted literal
            # string, not as the SVTYPE column's value — verify this is intended
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
2578
2579    def get_variant_id_column(
2580        self, variant_id_column: str = "variant_id", force: bool = None
2581    ) -> str:
2582        """
2583        This function returns the variant_id column name
2584
2585        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2586        defaults to variant_id
2587        :type variant_id_column: str (optional)
2588        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2589        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2590        if it is not already set, or if it is set
2591        :type force: bool
2592        :return: The variant_id column name.
2593        """
2594
2595        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
2596
2597    ###
2598    # Annotation
2599    ###
2600
2601    def scan_databases(
2602        self, database_formats: list["parquet"], database_releases: list = ["current"]
2603    ) -> dict:
2604        """
2605        The function `scan_databases` scans for available databases based on specified formats and
2606        releases.
2607
2608        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2609        of the databases to be scanned. In this case, the accepted format is "parquet"
2610        :type database_formats: list ["parquet"]
2611        :param database_releases: The `database_releases` parameter is a list that specifies the
2612        releases of the databases to be scanned. In the provided function, the default value for
2613        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2614        databases that are in the "current"
2615        :type database_releases: list
2616        :return: The function `scan_databases` returns a dictionary containing information about
2617        databases that match the specified formats and releases.
2618        """
2619
2620        # Config
2621        config = self.get_config()
2622
2623        # Param
2624        param = self.get_param()
2625
2626        # Param - Assembly
2627        assembly = param.get("assembly", config.get("assembly", None))
2628        if not assembly:
2629            assembly = DEFAULT_ASSEMBLY
2630            log.warning(f"Default assembly '{assembly}'")
2631
2632        # Scan for availabled databases
2633        log.info(
2634            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2635        )
2636        databases_infos_dict = databases_infos(
2637            database_folder_releases=database_releases,
2638            database_formats=database_formats,
2639            assembly=assembly,
2640            config=config,
2641        )
2642        log.info(
2643            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2644        )
2645
2646        return databases_infos_dict
2647
2648    def annotation(self) -> None:
2649        """
2650        It annotates the VCF file with the annotations specified in the config file.
2651        """
2652
2653        # Config
2654        config = self.get_config()
2655
2656        # Param
2657        param = self.get_param()
2658
2659        # Param - Assembly
2660        assembly = param.get("assembly", config.get("assembly", None))
2661        if not assembly:
2662            assembly = DEFAULT_ASSEMBLY
2663            log.warning(f"Default assembly '{assembly}'")
2664
2665        # annotations databases folders
2666        annotations_databases = set(
2667            config.get("folders", {})
2668            .get("databases", {})
2669            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2670            + config.get("folders", {})
2671            .get("databases", {})
2672            .get("parquet", ["~/howard/databases/parquet/current"])
2673            + config.get("folders", {})
2674            .get("databases", {})
2675            .get("bcftools", ["~/howard/databases/bcftools/current"])
2676        )
2677
2678        # Get param annotations
2679        if param.get("annotations", None) and isinstance(
2680            param.get("annotations", None), str
2681        ):
2682            log.debug(param.get("annotations", None))
2683            param_annotation_list = param.get("annotations").split(",")
2684        else:
2685            param_annotation_list = []
2686
2687        # Each tools param
2688        if param.get("annotation_parquet", None) != None:
2689            log.debug(
2690                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2691            )
2692            if isinstance(param.get("annotation_parquet", None), list):
2693                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2694            else:
2695                param_annotation_list.append(param.get("annotation_parquet"))
2696        if param.get("annotation_snpsift", None) != None:
2697            if isinstance(param.get("annotation_snpsift", None), list):
2698                param_annotation_list.append(
2699                    "snpsift:"
2700                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2701                )
2702            else:
2703                param_annotation_list.append(
2704                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2705                )
2706        if param.get("annotation_snpeff", None) != None:
2707            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2708        if param.get("annotation_bcftools", None) != None:
2709            if isinstance(param.get("annotation_bcftools", None), list):
2710                param_annotation_list.append(
2711                    "bcftools:"
2712                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2713                )
2714            else:
2715                param_annotation_list.append(
2716                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2717                )
2718        if param.get("annotation_annovar", None) != None:
2719            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2720        if param.get("annotation_exomiser", None) != None:
2721            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2722        if param.get("annotation_splice", None) != None:
2723            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2724
2725        # Merge param annotations list
2726        param["annotations"] = ",".join(param_annotation_list)
2727
2728        # debug
2729        log.debug(f"param_annotations={param['annotations']}")
2730
2731        if param.get("annotations"):
2732
2733            # Log
2734            # log.info("Annotations - Check annotation parameters")
2735
2736            if not "annotation" in param:
2737                param["annotation"] = {}
2738
2739            # List of annotations parameters
2740            annotations_list_input = {}
2741            if isinstance(param.get("annotations", None), str):
2742                annotation_file_list = [
2743                    value for value in param.get("annotations", "").split(",")
2744                ]
2745                for annotation_file in annotation_file_list:
2746                    annotations_list_input[annotation_file] = {"INFO": None}
2747            else:
2748                annotations_list_input = param.get("annotations", {})
2749
2750            log.info(f"Quick Annotations:")
2751            for annotation_key in list(annotations_list_input.keys()):
2752                log.info(f"   {annotation_key}")
2753
2754            # List of annotations and associated fields
2755            annotations_list = {}
2756
2757            for annotation_file in annotations_list_input:
2758
2759                # Explode annotations if ALL
2760                if (
2761                    annotation_file.upper() == "ALL"
2762                    or annotation_file.upper().startswith("ALL:")
2763                ):
2764
2765                    # check ALL parameters (formats, releases)
2766                    annotation_file_split = annotation_file.split(":")
2767                    database_formats = "parquet"
2768                    database_releases = "current"
2769                    for annotation_file_option in annotation_file_split[1:]:
2770                        database_all_options_split = annotation_file_option.split("=")
2771                        if database_all_options_split[0] == "format":
2772                            database_formats = database_all_options_split[1].split("+")
2773                        if database_all_options_split[0] == "release":
2774                            database_releases = database_all_options_split[1].split("+")
2775
2776                    # Scan for availabled databases
2777                    databases_infos_dict = self.scan_databases(
2778                        database_formats=database_formats,
2779                        database_releases=database_releases,
2780                    )
2781
2782                    # Add found databases in annotation parameters
2783                    for database_infos in databases_infos_dict.keys():
2784                        annotations_list[database_infos] = {"INFO": None}
2785
2786                else:
2787                    annotations_list[annotation_file] = annotations_list_input[
2788                        annotation_file
2789                    ]
2790
2791            # Check each databases
2792            if len(annotations_list):
2793
2794                log.info(
2795                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2796                )
2797
2798                for annotation_file in annotations_list:
2799
2800                    # Init
2801                    annotations = annotations_list.get(annotation_file, None)
2802
2803                    # Annotation snpEff
2804                    if annotation_file.startswith("snpeff"):
2805
2806                        log.debug(f"Quick Annotation snpEff")
2807
2808                        if "snpeff" not in param["annotation"]:
2809                            param["annotation"]["snpeff"] = {}
2810
2811                        if "options" not in param["annotation"]["snpeff"]:
2812                            param["annotation"]["snpeff"]["options"] = ""
2813
2814                        # snpEff options in annotations
2815                        param["annotation"]["snpeff"]["options"] = "".join(
2816                            annotation_file.split(":")[1:]
2817                        )
2818
2819                    # Annotation Annovar
2820                    elif annotation_file.startswith("annovar"):
2821
2822                        log.debug(f"Quick Annotation Annovar")
2823
2824                        if "annovar" not in param["annotation"]:
2825                            param["annotation"]["annovar"] = {}
2826
2827                        if "annotations" not in param["annotation"]["annovar"]:
2828                            param["annotation"]["annovar"]["annotations"] = {}
2829
2830                        # Options
2831                        annotation_file_split = annotation_file.split(":")
2832                        for annotation_file_annotation in annotation_file_split[1:]:
2833                            if annotation_file_annotation:
2834                                param["annotation"]["annovar"]["annotations"][
2835                                    annotation_file_annotation
2836                                ] = annotations
2837
2838                    # Annotation Exomiser
2839                    elif annotation_file.startswith("exomiser"):
2840
2841                        log.debug(f"Quick Annotation Exomiser")
2842
2843                        param["annotation"]["exomiser"] = params_string_to_dict(
2844                            annotation_file
2845                        )
2846
2847                    # Annotation Splice
2848                    elif annotation_file.startswith("splice"):
2849
2850                        log.debug(f"Quick Annotation Splice")
2851
2852                        param["annotation"]["splice"] = params_string_to_dict(
2853                            annotation_file
2854                        )
2855
2856                    # Annotation Parquet or BCFTOOLS
2857                    else:
2858
2859                        # Tools detection
2860                        if annotation_file.startswith("bcftools:"):
2861                            annotation_tool_initial = "bcftools"
2862                            annotation_file = ":".join(annotation_file.split(":")[1:])
2863                        elif annotation_file.startswith("snpsift:"):
2864                            annotation_tool_initial = "snpsift"
2865                            annotation_file = ":".join(annotation_file.split(":")[1:])
2866                        else:
2867                            annotation_tool_initial = None
2868
2869                        # list of files
2870                        annotation_file_list = annotation_file.replace("+", ":").split(
2871                            ":"
2872                        )
2873
2874                        for annotation_file in annotation_file_list:
2875
2876                            if annotation_file:
2877
2878                                # Annotation tool initial
2879                                annotation_tool = annotation_tool_initial
2880
2881                                # Find file
2882                                annotation_file_found = None
2883
2884                                # Expand user
2885                                annotation_file = full_path(annotation_file)
2886
2887                                if os.path.exists(annotation_file):
2888                                    annotation_file_found = annotation_file
2889
2890                                else:
2891                                    # Find within assembly folders
2892                                    for annotations_database in annotations_databases:
2893                                        found_files = find_all(
2894                                            annotation_file,
2895                                            os.path.join(
2896                                                annotations_database, assembly
2897                                            ),
2898                                        )
2899                                        if len(found_files) > 0:
2900                                            annotation_file_found = found_files[0]
2901                                            break
2902                                    if not annotation_file_found and not assembly:
2903                                        # Find within folders
2904                                        for (
2905                                            annotations_database
2906                                        ) in annotations_databases:
2907                                            found_files = find_all(
2908                                                annotation_file, annotations_database
2909                                            )
2910                                            if len(found_files) > 0:
2911                                                annotation_file_found = found_files[0]
2912                                                break
2913                                log.debug(
2914                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
2915                                )
2916
2917                                # Full path
2918                                annotation_file_found = full_path(annotation_file_found)
2919
2920                                if annotation_file_found:
2921
2922                                    database = Database(database=annotation_file_found)
2923                                    quick_annotation_format = database.get_format()
2924                                    quick_annotation_is_compressed = (
2925                                        database.is_compressed()
2926                                    )
2927                                    quick_annotation_is_indexed = os.path.exists(
2928                                        f"{annotation_file_found}.tbi"
2929                                    )
2930                                    bcftools_preference = False
2931
2932                                    # Check Annotation Tool
2933                                    if not annotation_tool:
2934                                        if (
2935                                            bcftools_preference
2936                                            and quick_annotation_format
2937                                            in ["vcf", "bed"]
2938                                            and quick_annotation_is_compressed
2939                                            and quick_annotation_is_indexed
2940                                        ):
2941                                            annotation_tool = "bcftools"
2942                                        elif quick_annotation_format in [
2943                                            "vcf",
2944                                            "bed",
2945                                            "tsv",
2946                                            "tsv",
2947                                            "csv",
2948                                            "json",
2949                                            "tbl",
2950                                            "parquet",
2951                                            "duckdb",
2952                                        ]:
2953                                            annotation_tool = "parquet"
2954                                        else:
2955                                            log.error(
2956                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
2957                                            )
2958                                            raise ValueError(
2959                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
2960                                            )
2961
2962                                    log.debug(
2963                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
2964                                    )
2965
2966                                    # Annotation Tool dispatch
2967                                    if annotation_tool:
2968                                        if annotation_tool not in param["annotation"]:
2969                                            param["annotation"][annotation_tool] = {}
2970                                        if (
2971                                            "annotations"
2972                                            not in param["annotation"][annotation_tool]
2973                                        ):
2974                                            param["annotation"][annotation_tool][
2975                                                "annotations"
2976                                            ] = {}
2977                                        param["annotation"][annotation_tool][
2978                                            "annotations"
2979                                        ][annotation_file_found] = annotations
2980
2981                                else:
2982                                    log.error(
2983                                        f"Quick Annotation File {annotation_file} does NOT exist"
2984                                    )
2985
2986                self.set_param(param)
2987
2988        if param.get("annotation", None):
2989            log.info("Annotations")
2990            if param.get("annotation", {}).get("parquet", None):
2991                log.info("Annotations 'parquet'...")
2992                self.annotation_parquet()
2993            if param.get("annotation", {}).get("bcftools", None):
2994                log.info("Annotations 'bcftools'...")
2995                self.annotation_bcftools()
2996            if param.get("annotation", {}).get("snpsift", None):
2997                log.info("Annotations 'snpsift'...")
2998                self.annotation_snpsift()
2999            if param.get("annotation", {}).get("annovar", None):
3000                log.info("Annotations 'annovar'...")
3001                self.annotation_annovar()
3002            if param.get("annotation", {}).get("snpeff", None):
3003                log.info("Annotations 'snpeff'...")
3004                self.annotation_snpeff()
3005            if param.get("annotation", {}).get("exomiser", None) is not None:
3006                log.info("Annotations 'exomiser'...")
3007                self.annotation_exomiser()
3008            if param.get("annotation", {}).get("splice", None) is not None:
3009                log.info("Annotations 'splice' ...")
3010                self.annotation_splice()
3011
3012        # Explode INFOS fields into table fields
3013        if self.get_explode_infos():
3014            self.explode_infos(
3015                prefix=self.get_explode_infos_prefix(),
3016                fields=self.get_explode_infos_fields(),
3017                force=True,
3018            )
3019
3020    def annotation_snpsift(self, threads: int = None) -> None:
3021        """
3022        This function annotate with bcftools
3023
3024        :param threads: Number of threads to use
3025        :return: the value of the variable "return_value".
3026        """
3027
3028        # DEBUG
3029        log.debug("Start annotation with bcftools databases")
3030
3031        # Threads
3032        if not threads:
3033            threads = self.get_threads()
3034        log.debug("Threads: " + str(threads))
3035
3036        # Config
3037        config = self.get_config()
3038        log.debug("Config: " + str(config))
3039
3040        # Config - snpSift
3041        snpsift_bin_command = get_bin_command(
3042            bin="SnpSift.jar",
3043            tool="snpsift",
3044            bin_type="jar",
3045            config=config,
3046            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3047        )
3048        if not snpsift_bin_command:
3049            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3050            log.error(msg_err)
3051            raise ValueError(msg_err)
3052
3053        # Config - bcftools
3054        bcftools_bin_command = get_bin_command(
3055            bin="bcftools",
3056            tool="bcftools",
3057            bin_type="bin",
3058            config=config,
3059            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3060        )
3061        if not bcftools_bin_command:
3062            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3063            log.error(msg_err)
3064            raise ValueError(msg_err)
3065
3066        # Config - BCFTools databases folders
3067        databases_folders = set(
3068            self.get_config()
3069            .get("folders", {})
3070            .get("databases", {})
3071            .get("annotations", ["."])
3072            + self.get_config()
3073            .get("folders", {})
3074            .get("databases", {})
3075            .get("bcftools", ["."])
3076        )
3077        log.debug("Databases annotations: " + str(databases_folders))
3078
3079        # Param
3080        annotations = (
3081            self.get_param()
3082            .get("annotation", {})
3083            .get("snpsift", {})
3084            .get("annotations", None)
3085        )
3086        log.debug("Annotations: " + str(annotations))
3087
3088        # Assembly
3089        assembly = self.get_param().get(
3090            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3091        )
3092
3093        # Data
3094        table_variants = self.get_table_variants()
3095
3096        # Check if not empty
3097        log.debug("Check if not empty")
3098        sql_query_chromosomes = (
3099            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3100        )
3101        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3102        if not sql_query_chromosomes_df["count"][0]:
3103            log.info(f"VCF empty")
3104            return
3105
3106        # VCF header
3107        vcf_reader = self.get_header()
3108        log.debug("Initial header: " + str(vcf_reader.infos))
3109
3110        # Existing annotations
3111        for vcf_annotation in self.get_header().infos:
3112
3113            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3114            log.debug(
3115                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3116            )
3117
3118        if annotations:
3119
3120            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3121
3122                # Export VCF file
3123                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3124
3125                # Init
3126                commands = {}
3127
3128                for annotation in annotations:
3129                    annotation_fields = annotations[annotation]
3130
3131                    # Annotation Name
3132                    annotation_name = os.path.basename(annotation)
3133
3134                    if not annotation_fields:
3135                        annotation_fields = {"INFO": None}
3136
3137                    log.debug(f"Annotation '{annotation_name}'")
3138                    log.debug(
3139                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3140                    )
3141
3142                    # Create Database
3143                    database = Database(
3144                        database=annotation,
3145                        databases_folders=databases_folders,
3146                        assembly=assembly,
3147                    )
3148
3149                    # Find files
3150                    db_file = database.get_database()
3151                    db_file = full_path(db_file)
3152                    db_hdr_file = database.get_header_file()
3153                    db_hdr_file = full_path(db_hdr_file)
3154                    db_file_type = database.get_format()
3155                    db_tbi_file = f"{db_file}.tbi"
3156                    db_file_compressed = database.is_compressed()
3157
3158                    # Check if compressed
3159                    if not db_file_compressed:
3160                        log.error(
3161                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3162                        )
3163                        raise ValueError(
3164                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3165                        )
3166
3167                    # Check if indexed
3168                    if not os.path.exists(db_tbi_file):
3169                        log.error(
3170                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3171                        )
3172                        raise ValueError(
3173                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3174                        )
3175
3176                    # Check index - try to create if not exists
3177                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3178                        log.error("Annotation failed: database not valid")
3179                        log.error(f"Annotation annotation file: {db_file}")
3180                        log.error(f"Annotation annotation header: {db_hdr_file}")
3181                        log.error(f"Annotation annotation index: {db_tbi_file}")
3182                        raise ValueError(
3183                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3184                        )
3185                    else:
3186
3187                        log.debug(
3188                            f"Annotation '{annotation}' - file: "
3189                            + str(db_file)
3190                            + " and "
3191                            + str(db_hdr_file)
3192                        )
3193
3194                        # Load header as VCF object
3195                        db_hdr_vcf = Variants(input=db_hdr_file)
3196                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3197                        log.debug(
3198                            "Annotation database header: "
3199                            + str(db_hdr_vcf_header_infos)
3200                        )
3201
3202                        # For all fields in database
3203                        annotation_fields_full = False
3204                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3205                            annotation_fields = {
3206                                key: key for key in db_hdr_vcf_header_infos
3207                            }
3208                            log.debug(
3209                                "Annotation database header - All annotations added: "
3210                                + str(annotation_fields)
3211                            )
3212                            annotation_fields_full = True
3213
3214                        # # Create file for field rename
3215                        # log.debug("Create file for field rename")
3216                        # tmp_rename = NamedTemporaryFile(
3217                        #     prefix=self.get_prefix(),
3218                        #     dir=self.get_tmp_dir(),
3219                        #     suffix=".rename",
3220                        #     delete=False,
3221                        # )
3222                        # tmp_rename_name = tmp_rename.name
3223                        # tmp_files.append(tmp_rename_name)
3224
3225                        # Number of fields
3226                        nb_annotation_field = 0
3227                        annotation_list = []
3228                        annotation_infos_rename_list = []
3229
3230                        for annotation_field in annotation_fields:
3231
3232                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3233                            annotation_fields_new_name = annotation_fields.get(
3234                                annotation_field, annotation_field
3235                            )
3236                            if not annotation_fields_new_name:
3237                                annotation_fields_new_name = annotation_field
3238
3239                            # Check if field is in DB and if field is not elready in input data
3240                            if (
3241                                annotation_field in db_hdr_vcf.get_header().infos
3242                                and annotation_fields_new_name
3243                                not in self.get_header().infos
3244                            ):
3245
3246                                log.info(
3247                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3248                                )
3249
3250                                # BCFTools annotate param to rename fields
3251                                if annotation_field != annotation_fields_new_name:
3252                                    annotation_infos_rename_list.append(
3253                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3254                                    )
3255
3256                                # Add INFO field to header
3257                                db_hdr_vcf_header_infos_number = (
3258                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3259                                )
3260                                db_hdr_vcf_header_infos_type = (
3261                                    db_hdr_vcf_header_infos[annotation_field].type
3262                                    or "String"
3263                                )
3264                                db_hdr_vcf_header_infos_description = (
3265                                    db_hdr_vcf_header_infos[annotation_field].desc
3266                                    or f"{annotation_field} description"
3267                                )
3268                                db_hdr_vcf_header_infos_source = (
3269                                    db_hdr_vcf_header_infos[annotation_field].source
3270                                    or "unknown"
3271                                )
3272                                db_hdr_vcf_header_infos_version = (
3273                                    db_hdr_vcf_header_infos[annotation_field].version
3274                                    or "unknown"
3275                                )
3276
3277                                vcf_reader.infos[annotation_fields_new_name] = (
3278                                    vcf.parser._Info(
3279                                        annotation_fields_new_name,
3280                                        db_hdr_vcf_header_infos_number,
3281                                        db_hdr_vcf_header_infos_type,
3282                                        db_hdr_vcf_header_infos_description,
3283                                        db_hdr_vcf_header_infos_source,
3284                                        db_hdr_vcf_header_infos_version,
3285                                        self.code_type_map[
3286                                            db_hdr_vcf_header_infos_type
3287                                        ],
3288                                    )
3289                                )
3290
3291                                annotation_list.append(annotation_field)
3292
3293                                nb_annotation_field += 1
3294
3295                            else:
3296
3297                                if (
3298                                    annotation_field
3299                                    not in db_hdr_vcf.get_header().infos
3300                                ):
3301                                    log.warning(
3302                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3303                                    )
3304                                if (
3305                                    annotation_fields_new_name
3306                                    in self.get_header().infos
3307                                ):
3308                                    log.warning(
3309                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3310                                    )
3311
3312                        log.info(
3313                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3314                        )
3315
3316                        annotation_infos = ",".join(annotation_list)
3317
3318                        if annotation_infos != "":
3319
3320                            # Annotated VCF (and error file)
3321                            tmp_annotation_vcf_name = os.path.join(
3322                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3323                            )
3324                            tmp_annotation_vcf_name_err = (
3325                                tmp_annotation_vcf_name + ".err"
3326                            )
3327
3328                            # Add fields to annotate
3329                            if not annotation_fields_full:
3330                                annotation_infos_option = f"-info {annotation_infos}"
3331                            else:
3332                                annotation_infos_option = ""
3333
3334                            # Info fields rename
3335                            if annotation_infos_rename_list:
3336                                annotation_infos_rename = " -c " + ",".join(
3337                                    annotation_infos_rename_list
3338                                )
3339                            else:
3340                                annotation_infos_rename = ""
3341
3342                            # Annotate command
3343                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3344
3345                            # Add command
3346                            commands[command_annotate] = tmp_annotation_vcf_name
3347
3348                if commands:
3349
3350                    # Export VCF file
3351                    self.export_variant_vcf(
3352                        vcf_file=tmp_vcf_name,
3353                        remove_info=True,
3354                        add_samples=False,
3355                        index=True,
3356                    )
3357                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3358
3359                    # Num command
3360                    nb_command = 0
3361
3362                    # Annotate
3363                    for command_annotate in commands:
3364                        nb_command += 1
3365                        log.info(
3366                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3367                        )
3368                        log.debug(f"command_annotate={command_annotate}")
3369                        run_parallel_commands([command_annotate], threads)
3370
3371                        # Debug
3372                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3373
3374                        # Update variants
3375                        log.info(
3376                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3377                        )
3378                        self.update_from_vcf(commands[command_annotate])
3379
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using external VCF/BED databases through
        ``bcftools annotate``.

        Workflow (as implemented below):
        1. Export the current variants to a temporary bgzipped VCF.
        2. For each database in param ``annotation.bcftools.annotations``:
           - resolve the database file, its header file and its tabix index
             through the ``Database`` helper (the file must be compressed and
             indexed),
           - register every requested INFO field (with optional renaming via
             bcftools ``NEW:=INFO/OLD`` syntax) into this object's VCF header,
           - build one ``bcftools annotate`` command per chromosome, restricted
             to merged +/- 1Mb regions around the variants (written to a
             temporary BED file).
        3. Run all annotate commands in parallel, merge the per-chromosome
           annotated VCFs with ``bcftools merge``, scan the captured stderr
           files, then re-import the merged INFO fields with
           :meth:`update_from_vcf`.

        :param threads: Number of threads to use; defaults to
            :meth:`get_threads` when not provided
        :return: None (returns early when the variants table is empty)
        :raises ValueError: if the bcftools binary is missing, a database file
            is not compressed, not tabix-indexed or not found, or if any
            annotation command wrote "[E::" lines to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep tmp files/folders for inspection when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but not used later in this
        # method — tmp files are removed by the merge command's "rm -f"
        # regardless; confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (fail fast when not found)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - database folders: generic "annotations" folders plus
        # bcftools-specific folders, deduplicated via set
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with, as {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Nothing to do on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Temporary bgzipped VCF that will hold the variants to annotate
        # (only actually exported later, if at least one command is queued)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated in place below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log annotations already present in the input VCF
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs to merge
            commands = []  # bcftools annotate commands to run
            tmp_files = []  # temporary files removed after the merge
            err_files = []  # stderr capture files scanned for errors/warnings

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No fields requested -> take all INFO fields of the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to resolve files, format and index
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # bcftools annotate requires a compressed database
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # bcftools annotate requires a tabix index (.tbi)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Database file and its header file must both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header as a Variants object to browse
                    # its INFO field definitions
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" keyword -> annotate with every INFO field
                    # of the database (no renaming)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field output name (rename support); fall back to the
                        # original field name when no rename is provided
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep only fields present in the database header and
                        # not already present in the input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, with defaults when the
                            # database header definition is incomplete
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" syntax: "NEW:=INFO/OLD" renames a
                            # field, a bare name keeps it as-is
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: log the reason(s)
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" meta
                        # lines (drop the "#CHROM" line and any variant lines)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat when the header file is gzipped)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases: prepend the positional columns
                        # expected by bcftools for BED input
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # BED restricting bcftools to the regions covering
                            # this chromosome's variants
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each
                            # variant, clamped at 0, then merged
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp output VCF (and stderr file) for this
                            # chromosome's annotate command
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # bcftools annotate restricted to the BED regions,
                            # stderr appended to the .err file, output
                            # tabix-indexed on success
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export current variants to the temporary VCF
                # (INFO removed, no samples, tabix-indexed)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Split available threads between the parallel annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # At least one thread per command
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add per-command --threads option when more than one thread
                # is available per command
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp merged file (auto-deleted when the handle closes)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Remove intermediate tmp files after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Scan stderr files: "[W::" lines are warnings, "[E::"
                    # lines are errors; any error aborts the annotation
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info: deduplicated warnings and errors
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info: all deduplicated messages
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Re-import annotated INFO fields into the variants table
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
3860
3861    def annotation_exomiser(self, threads: int = None) -> None:
3862        """
3863        This function annotate with Exomiser
3864
3865        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3866        - "analysis" (dict/file):
3867            Full analysis dictionnary parameters (see Exomiser docs).
3868            Either a dict, or a file in JSON or YAML format.
3869            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3870            Default : None
3871        - "preset" (string):
3872            Analysis preset (available in config folder).
3873            Used if no full "analysis" is provided.
3874            Default: "exome"
3875        - "phenopacket" (dict/file):
3876            Samples and phenotipic features parameters (see Exomiser docs).
3877            Either a dict, or a file in JSON or YAML format.
3878            Default: None
3879        - "subject" (dict):
3880            Sample parameters (see Exomiser docs).
3881            Example:
3882                "subject":
3883                    {
3884                        "id": "ISDBM322017",
3885                        "sex": "FEMALE"
3886                    }
3887            Default: None
3888        - "sample" (string):
3889            Sample name to construct "subject" section:
3890                "subject":
3891                    {
3892                        "id": "<sample>",
3893                        "sex": "UNKNOWN_SEX"
3894                    }
3895            Default: None
3896        - "phenotypicFeatures" (dict)
3897            Phenotypic features to construct "subject" section.
3898            Example:
3899                "phenotypicFeatures":
3900                    [
3901                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3902                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3903                    ]
3904        - "hpo" (list)
3905            List of HPO ids as phenotypic features.
3906            Example:
3907                "hpo": ['0001156', '0001363', '0011304', '0010055']
3908            Default: []
3909        - "outputOptions" (dict):
3910            Output options (see Exomiser docs).
3911            Default:
3912                "output_options" =
3913                    {
3914                        "outputContributingVariantsOnly": False,
3915                        "numGenes": 0,
3916                        "outputFormats": ["TSV_VARIANT", "VCF"]
3917                    }
3918        - "transcript_source" (string):
3919            Transcript source (either "refseq", "ucsc", "ensembl")
3920            Default: "refseq"
3921        - "exomiser_to_info" (boolean):
3922            Add exomiser TSV file columns as INFO fields in VCF.
3923            Default: False
3924        - "release" (string):
3925            Exomise database release.
3926            If not exists, database release will be downloaded (take a while).
3927            Default: None (provided by application.properties configuration file)
3928        - "exomiser_application_properties" (file):
3929            Exomiser configuration file (see Exomiser docs).
3930            Useful to automatically download databases (especially for specific genome databases).
3931
3932        Notes:
3933        - If no sample in parameters, first sample in VCF will be chosen
3934        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
3935
3936        :param threads: The number of threads to use
3937        :return: None.
3938        """
3939
3940        # DEBUG
3941        log.debug("Start annotation with Exomiser databases")
3942
3943        # Threads
3944        if not threads:
3945            threads = self.get_threads()
3946        log.debug("Threads: " + str(threads))
3947
3948        # Config
3949        config = self.get_config()
3950        log.debug("Config: " + str(config))
3951
3952        # Config - Folders - Databases
3953        databases_folders = (
3954            config.get("folders", {})
3955            .get("databases", {})
3956            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
3957        )
3958        databases_folders = full_path(databases_folders)
3959        if not os.path.exists(databases_folders):
3960            log.error(f"Databases annotations: {databases_folders} NOT found")
3961        log.debug("Databases annotations: " + str(databases_folders))
3962
3963        # Config - Exomiser
3964        exomiser_bin_command = get_bin_command(
3965            bin="exomiser-cli*.jar",
3966            tool="exomiser",
3967            bin_type="jar",
3968            config=config,
3969            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
3970        )
3971        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
3972        if not exomiser_bin_command:
3973            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
3974            log.error(msg_err)
3975            raise ValueError(msg_err)
3976
3977        # Param
3978        param = self.get_param()
3979        log.debug("Param: " + str(param))
3980
3981        # Param - Exomiser
3982        param_exomiser = param.get("annotation", {}).get("exomiser", {})
3983        log.debug(f"Param Exomiser: {param_exomiser}")
3984
3985        # Param - Assembly
3986        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
3987        log.debug("Assembly: " + str(assembly))
3988
3989        # Data
3990        table_variants = self.get_table_variants()
3991
3992        # Check if not empty
3993        log.debug("Check if not empty")
3994        sql_query_chromosomes = (
3995            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3996        )
3997        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
3998            log.info(f"VCF empty")
3999            return False
4000
4001        # VCF header
4002        vcf_reader = self.get_header()
4003        log.debug("Initial header: " + str(vcf_reader.infos))
4004
4005        # Samples
4006        samples = self.get_header_sample_list()
4007        if not samples:
4008            log.error("No Samples in VCF")
4009            return False
4010        log.debug(f"Samples: {samples}")
4011
4012        # Memory limit
4013        memory_limit = self.get_memory("8G")
4014        log.debug(f"memory_limit: {memory_limit}")
4015
4016        # Exomiser java options
4017        exomiser_java_options = (
4018            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4019        )
4020        log.debug(f"Exomiser java options: {exomiser_java_options}")
4021
4022        # Download Exomiser (if not exists)
4023        exomiser_release = param_exomiser.get("release", None)
4024        exomiser_application_properties = param_exomiser.get(
4025            "exomiser_application_properties", None
4026        )
4027        databases_download_exomiser(
4028            assemblies=[assembly],
4029            exomiser_folder=databases_folders,
4030            exomiser_release=exomiser_release,
4031            exomiser_phenotype_release=exomiser_release,
4032            exomiser_application_properties=exomiser_application_properties,
4033        )
4034
4035        # Force annotation
4036        force_update_annotation = True
4037
4038        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4039            log.debug("Start annotation Exomiser")
4040
4041            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4042
4043                # tmp_dir = "/tmp/exomiser"
4044
4045                ### ANALYSIS ###
4046                ################
4047
4048                # Create analysis.json through analysis dict
4049                # either analysis in param or by default
4050                # depending on preset exome/genome)
4051
4052                # Init analysis dict
4053                param_exomiser_analysis_dict = {}
4054
4055                # analysis from param
4056                param_exomiser_analysis = param_exomiser.get("analysis", {})
4057                param_exomiser_analysis = full_path(param_exomiser_analysis)
4058
4059                # If analysis in param -> load analysis json
4060                if param_exomiser_analysis:
4061
4062                    # If param analysis is a file and exists
4063                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4064                        param_exomiser_analysis
4065                    ):
4066                        # Load analysis file into analysis dict (either yaml or json)
4067                        with open(param_exomiser_analysis) as json_file:
4068                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4069
4070                    # If param analysis is a dict
4071                    elif isinstance(param_exomiser_analysis, dict):
4072                        # Load analysis dict into analysis dict (either yaml or json)
4073                        param_exomiser_analysis_dict = param_exomiser_analysis
4074
4075                    # Error analysis type
4076                    else:
4077                        log.error(f"Analysis type unknown. Check param file.")
4078                        raise ValueError(f"Analysis type unknown. Check param file.")
4079
4080                # Case no input analysis config file/dict
4081                # Use preset (exome/genome) to open default config file
4082                if not param_exomiser_analysis_dict:
4083
4084                    # default preset
4085                    default_preset = "exome"
4086
4087                    # Get param preset or default preset
4088                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4089
4090                    # Try to find if preset is a file
4091                    if os.path.exists(param_exomiser_preset):
4092                        # Preset file is provided in full path
4093                        param_exomiser_analysis_default_config_file = (
4094                            param_exomiser_preset
4095                        )
4096                    # elif os.path.exists(full_path(param_exomiser_preset)):
4097                    #     # Preset file is provided in full path
4098                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4099                    elif os.path.exists(
4100                        os.path.join(folder_config, param_exomiser_preset)
4101                    ):
4102                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4103                        param_exomiser_analysis_default_config_file = os.path.join(
4104                            folder_config, param_exomiser_preset
4105                        )
4106                    else:
4107                        # Construct preset file
4108                        param_exomiser_analysis_default_config_file = os.path.join(
4109                            folder_config,
4110                            f"preset-{param_exomiser_preset}-analysis.json",
4111                        )
4112
4113                    # If preset file exists
4114                    param_exomiser_analysis_default_config_file = full_path(
4115                        param_exomiser_analysis_default_config_file
4116                    )
4117                    if os.path.exists(param_exomiser_analysis_default_config_file):
4118                        # Load preset file into analysis dict (either yaml or json)
4119                        with open(
4120                            param_exomiser_analysis_default_config_file
4121                        ) as json_file:
4122                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4123                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4124                                json_file
4125                            )
4126
4127                    # Error preset file
4128                    else:
4129                        log.error(
4130                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4131                        )
4132                        raise ValueError(
4133                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4134                        )
4135
4136                # If no analysis dict created
4137                if not param_exomiser_analysis_dict:
4138                    log.error(f"No analysis config")
4139                    raise ValueError(f"No analysis config")
4140
4141                # Log
4142                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4143
4144                ### PHENOPACKET ###
4145                ###################
4146
4147                # If no PhenoPacket in analysis dict -> check in param
4148                if "phenopacket" not in param_exomiser_analysis_dict:
4149
4150                    # If PhenoPacket in param -> load phenopacket json
4151                    if param_exomiser.get("phenopacket", None):
4152
4153                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4154                        param_exomiser_phenopacket = full_path(
4155                            param_exomiser_phenopacket
4156                        )
4157
4158                        # If param phenopacket is a file and exists
4159                        if isinstance(
4160                            param_exomiser_phenopacket, str
4161                        ) and os.path.exists(param_exomiser_phenopacket):
4162                            # Load phenopacket file into analysis dict (either yaml or json)
4163                            with open(param_exomiser_phenopacket) as json_file:
4164                                param_exomiser_analysis_dict["phenopacket"] = (
4165                                    yaml.safe_load(json_file)
4166                                )
4167
4168                        # If param phenopacket is a dict
4169                        elif isinstance(param_exomiser_phenopacket, dict):
4170                            # Load phenopacket dict into analysis dict (either yaml or json)
4171                            param_exomiser_analysis_dict["phenopacket"] = (
4172                                param_exomiser_phenopacket
4173                            )
4174
4175                        # Error phenopacket type
4176                        else:
4177                            log.error(f"Phenopacket type unknown. Check param file.")
4178                            raise ValueError(
4179                                f"Phenopacket type unknown. Check param file."
4180                            )
4181
4182                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4183                if "phenopacket" not in param_exomiser_analysis_dict:
4184
4185                    # Init PhenoPacket
4186                    param_exomiser_analysis_dict["phenopacket"] = {
4187                        "id": "analysis",
4188                        "proband": {},
4189                    }
4190
4191                    ### Add subject ###
4192
4193                    # If subject exists
4194                    param_exomiser_subject = param_exomiser.get("subject", {})
4195
4196                    # If subject not exists -> found sample ID
4197                    if not param_exomiser_subject:
4198
4199                        # Found sample ID in param
4200                        sample = param_exomiser.get("sample", None)
4201
4202                        # Find sample ID (first sample)
4203                        if not sample:
4204                            sample_list = self.get_header_sample_list()
4205                            if len(sample_list) > 0:
4206                                sample = sample_list[0]
4207                            else:
4208                                log.error(f"No sample found")
4209                                raise ValueError(f"No sample found")
4210
4211                        # Create subject
4212                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4213
4214                    # Add to dict
4215                    param_exomiser_analysis_dict["phenopacket"][
4216                        "subject"
4217                    ] = param_exomiser_subject
4218
4219                    ### Add "phenotypicFeatures" ###
4220
4221                    # If phenotypicFeatures exists
4222                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4223                        "phenotypicFeatures", []
4224                    )
4225
4226                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4227                    if not param_exomiser_phenotypicfeatures:
4228
4229                        # Found HPO in param
4230                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4231
4232                        # Split HPO if list in string format separated by comma
4233                        if isinstance(param_exomiser_hpo, str):
4234                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4235
4236                        # Create HPO list
4237                        for hpo in param_exomiser_hpo:
4238                            hpo_clean = re.sub("[^0-9]", "", hpo)
4239                            param_exomiser_phenotypicfeatures.append(
4240                                {
4241                                    "type": {
4242                                        "id": f"HP:{hpo_clean}",
4243                                        "label": f"HP:{hpo_clean}",
4244                                    }
4245                                }
4246                            )
4247
4248                    # Add to dict
4249                    param_exomiser_analysis_dict["phenopacket"][
4250                        "phenotypicFeatures"
4251                    ] = param_exomiser_phenotypicfeatures
4252
4253                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4254                    if not param_exomiser_phenotypicfeatures:
4255                        for step in param_exomiser_analysis_dict.get(
4256                            "analysis", {}
4257                        ).get("steps", []):
4258                            if "hiPhivePrioritiser" in step:
4259                                param_exomiser_analysis_dict.get("analysis", {}).get(
4260                                    "steps", []
4261                                ).remove(step)
4262
4263                ### Add Input File ###
4264
4265                # Initial file name and htsFiles
4266                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4267                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4268                    {
4269                        "uri": tmp_vcf_name,
4270                        "htsFormat": "VCF",
4271                        "genomeAssembly": assembly,
4272                    }
4273                ]
4274
4275                ### Add metaData ###
4276
4277                # If metaData not in analysis dict
4278                if "metaData" not in param_exomiser_analysis_dict:
4279                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4280                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4281                        "createdBy": "howard",
4282                        "phenopacketSchemaVersion": 1,
4283                    }
4284
4285                ### OutputOptions ###
4286
4287                # Init output result folder
4288                output_results = os.path.join(tmp_dir, "results")
4289
4290                # If no outputOptions in analysis dict
4291                if "outputOptions" not in param_exomiser_analysis_dict:
4292
4293                    # default output formats
4294                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4295
4296                    # Get outputOptions in param
4297                    output_options = param_exomiser.get("outputOptions", None)
4298
4299                    # If no output_options in param -> use default output options
4300                    if not output_options:
4301                        output_options = {
4302                            "outputContributingVariantsOnly": False,
4303                            "numGenes": 0,
4304                            "outputFormats": defaut_output_formats,
4305                        }
4306
4307                    # Replace outputDirectory in output options
4308                    output_options["outputDirectory"] = output_results
4309                    output_options["outputFileName"] = "howard"
4310
4311                    # Add outputOptions in analysis dict
4312                    param_exomiser_analysis_dict["outputOptions"] = output_options
4313
4314                else:
4315
4316                    # Replace output_results and output format (if exists in param)
4317                    param_exomiser_analysis_dict["outputOptions"][
4318                        "outputDirectory"
4319                    ] = output_results
4320                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4321                        list(
4322                            set(
4323                                param_exomiser_analysis_dict.get(
4324                                    "outputOptions", {}
4325                                ).get("outputFormats", [])
4326                                + ["TSV_VARIANT", "VCF"]
4327                            )
4328                        )
4329                    )
4330
4331                # log
4332                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4333
4334                ### ANALYSIS FILE ###
4335                #####################
4336
4337                ### Full JSON analysis config file ###
4338
4339                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4340                with open(exomiser_analysis, "w") as fp:
4341                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4342
4343                ### SPLIT analysis and sample config files
4344
4345                # Splitted analysis dict
4346                param_exomiser_analysis_dict_for_split = (
4347                    param_exomiser_analysis_dict.copy()
4348                )
4349
4350                # Phenopacket JSON file
4351                exomiser_analysis_phenopacket = os.path.join(
4352                    tmp_dir, "analysis_phenopacket.json"
4353                )
4354                with open(exomiser_analysis_phenopacket, "w") as fp:
4355                    json.dump(
4356                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4357                        fp,
4358                        indent=4,
4359                    )
4360
4361                # Analysis JSON file without Phenopacket parameters
4362                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4363                exomiser_analysis_analysis = os.path.join(
4364                    tmp_dir, "analysis_analysis.json"
4365                )
4366                with open(exomiser_analysis_analysis, "w") as fp:
4367                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4368
4369                ### INITIAL VCF file ###
4370                #######################
4371
4372                ### Create list of samples to use and include into initial VCF file ####
4373
4374                # Subject (main sample)
4375                # Get sample ID in analysis dict
4376                sample_subject = (
4377                    param_exomiser_analysis_dict.get("phenopacket", {})
4378                    .get("subject", {})
4379                    .get("id", None)
4380                )
4381                sample_proband = (
4382                    param_exomiser_analysis_dict.get("phenopacket", {})
4383                    .get("proband", {})
4384                    .get("subject", {})
4385                    .get("id", None)
4386                )
4387                sample = []
4388                if sample_subject:
4389                    sample.append(sample_subject)
4390                if sample_proband:
4391                    sample.append(sample_proband)
4392
4393                # Get sample ID within Pedigree
4394                pedigree_persons_list = (
4395                    param_exomiser_analysis_dict.get("phenopacket", {})
4396                    .get("pedigree", {})
4397                    .get("persons", {})
4398                )
4399
4400                # Create list with all sample ID in pedigree (if exists)
4401                pedigree_persons = []
4402                for person in pedigree_persons_list:
4403                    pedigree_persons.append(person.get("individualId"))
4404
4405                # Concat subject sample ID and sample IDs in pedigree
4406                samples = list(set(sample + pedigree_persons))
4407
4408                # Check if sample list is not empty
4409                if not samples:
4410                    log.error(f"No samples found")
4411                    raise ValueError(f"No samples found")
4412
4413                # Create VCF with sample (either sample in param or first one by default)
4414                # Export VCF file
4415                self.export_variant_vcf(
4416                    vcf_file=tmp_vcf_name,
4417                    remove_info=True,
4418                    add_samples=True,
4419                    list_samples=samples,
4420                    index=False,
4421                )
4422
4423                ### Execute Exomiser ###
4424                ########################
4425
4426                # Init command
4427                exomiser_command = ""
4428
4429                # Command exomiser options
4430                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4431
4432                # Release
4433                exomiser_release = param_exomiser.get("release", None)
4434                if exomiser_release:
4435                    # phenotype data version
4436                    exomiser_options += (
4437                        f" --exomiser.phenotype.data-version={exomiser_release} "
4438                    )
4439                    # data version
4440                    exomiser_options += (
4441                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4442                    )
4443                    # variant white list
4444                    variant_white_list_file = (
4445                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4446                    )
4447                    if os.path.exists(
4448                        os.path.join(
4449                            databases_folders, assembly, variant_white_list_file
4450                        )
4451                    ):
4452                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4453
4454                # transcript_source
4455                transcript_source = param_exomiser.get(
4456                    "transcript_source", None
4457                )  # ucsc, refseq, ensembl
4458                if transcript_source:
4459                    exomiser_options += (
4460                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4461                    )
4462
4463                # If analysis contain proband param
4464                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4465                    "proband", {}
4466                ):
4467                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4468
4469                # If no proband (usually uniq sample)
4470                else:
4471                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4472
4473                # Log
4474                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4475
4476                # Run command
4477                result = subprocess.call(
4478                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4479                )
4480                if result:
4481                    log.error("Exomiser command failed")
4482                    raise ValueError("Exomiser command failed")
4483
4484                ### RESULTS ###
4485                ###############
4486
4487                ### Annotate with TSV fields ###
4488
4489                # Init result tsv file
4490                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4491
4492                # Init result tsv file
4493                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4494
4495                # Parse TSV file and explode columns in INFO field
4496                if exomiser_to_info and os.path.exists(output_results_tsv):
4497
4498                    # Log
4499                    log.debug("Exomiser columns to VCF INFO field")
4500
4501                    # Retrieve columns and types
4502                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4503                    output_results_tsv_df = self.get_query_to_df(query)
4504                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4505
4506                    # Init concat fields for update
4507                    sql_query_update_concat_fields = []
4508
4509                    # Fields to avoid
4510                    fields_to_avoid = [
4511                        "CONTIG",
4512                        "START",
4513                        "END",
4514                        "REF",
4515                        "ALT",
4516                        "QUAL",
4517                        "FILTER",
4518                        "GENOTYPE",
4519                    ]
4520
4521                    # List all columns to add into header
4522                    for header_column in output_results_tsv_columns:
4523
4524                        # If header column is enable
4525                        if header_column not in fields_to_avoid:
4526
4527                            # Header info type
4528                            header_info_type = "String"
4529                            header_column_df = output_results_tsv_df[header_column]
4530                            header_column_df_dtype = header_column_df.dtype
4531                            if header_column_df_dtype == object:
4532                                if (
4533                                    pd.to_numeric(header_column_df, errors="coerce")
4534                                    .notnull()
4535                                    .all()
4536                                ):
4537                                    header_info_type = "Float"
4538                            else:
4539                                header_info_type = "Integer"
4540
4541                            # Header info
4542                            characters_to_validate = ["-"]
4543                            pattern = "[" + "".join(characters_to_validate) + "]"
4544                            header_info_name = re.sub(
4545                                pattern,
4546                                "_",
4547                                f"Exomiser_{header_column}".replace("#", ""),
4548                            )
4549                            header_info_number = "."
4550                            header_info_description = (
4551                                f"Exomiser {header_column} annotation"
4552                            )
4553                            header_info_source = "Exomiser"
4554                            header_info_version = "unknown"
4555                            header_info_code = CODE_TYPE_MAP[header_info_type]
4556                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4557                                header_info_name,
4558                                header_info_number,
4559                                header_info_type,
4560                                header_info_description,
4561                                header_info_source,
4562                                header_info_version,
4563                                header_info_code,
4564                            )
4565
4566                            # Add field to add for update to concat fields
4567                            sql_query_update_concat_fields.append(
4568                                f"""
4569                                CASE
4570                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4571                                    THEN concat(
4572                                        '{header_info_name}=',
4573                                        table_parquet."{header_column}",
4574                                        ';'
4575                                        )
4576
4577                                    ELSE ''
4578                                END
4579                            """
4580                            )
4581
4582                    # Update query
4583                    sql_query_update = f"""
4584                        UPDATE {table_variants} as table_variants
4585                            SET INFO = concat(
4586                                            CASE
4587                                                WHEN INFO NOT IN ('', '.')
4588                                                THEN INFO
4589                                                ELSE ''
4590                                            END,
4591                                            CASE
4592                                                WHEN table_variants.INFO NOT IN ('','.')
4593                                                THEN ';'
4594                                                ELSE ''
4595                                            END,
4596                                            (
4597                                            SELECT 
4598                                                concat(
4599                                                    {",".join(sql_query_update_concat_fields)}
4600                                                )
4601                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4602                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4603                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4604                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4605                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4606                                            )
4607                                        )
4608                            ;
4609                        """
4610
4611                    # Update
4612                    self.conn.execute(sql_query_update)
4613
4614                ### Annotate with VCF INFO field ###
4615
4616                # Init result VCF file
4617                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4618
4619                # If VCF exists
4620                if os.path.exists(output_results_vcf):
4621
4622                    # Log
4623                    log.debug("Exomiser result VCF update variants")
4624
4625                    # Find Exomiser INFO field annotation in header
4626                    with gzip.open(output_results_vcf, "rt") as f:
4627                        header_list = self.read_vcf_header(f)
4628                    exomiser_vcf_header = vcf.Reader(
4629                        io.StringIO("\n".join(header_list))
4630                    )
4631
4632                    # Add annotation INFO field to header
4633                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4634
4635                    # Update variants with VCF
4636                    self.update_from_vcf(output_results_vcf)
4637
4638        return True
4639
4640    def annotation_snpeff(self, threads: int = None) -> None:
4641        """
4642        This function annotate with snpEff
4643
4644        :param threads: The number of threads to use
4645        :return: the value of the variable "return_value".
4646        """
4647
4648        # DEBUG
4649        log.debug("Start annotation with snpeff databases")
4650
4651        # Threads
4652        if not threads:
4653            threads = self.get_threads()
4654        log.debug("Threads: " + str(threads))
4655
4656        # DEBUG
4657        delete_tmp = True
4658        if self.get_config().get("verbosity", "warning") in ["debug"]:
4659            delete_tmp = False
4660            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4661
4662        # Config
4663        config = self.get_config()
4664        log.debug("Config: " + str(config))
4665
4666        # Config - Folders - Databases
4667        databases_folders = (
4668            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4669        )
4670        log.debug("Databases annotations: " + str(databases_folders))
4671
4672        # # Config - Java
4673        # java_bin = get_bin(
4674        #     tool="java",
4675        #     bin="java",
4676        #     bin_type="bin",
4677        #     config=config,
4678        #     default_folder="/usr/bin",
4679        # )
4680        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4681        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4682        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4683
4684        # # Config - snpEff bin
4685        # snpeff_jar = get_bin(
4686        #     tool="snpeff",
4687        #     bin="snpEff.jar",
4688        #     bin_type="jar",
4689        #     config=config,
4690        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4691        # )
4692        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4693        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4694        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4695
4696        # Config - snpEff bin command
4697        snpeff_bin_command = get_bin_command(
4698            bin="snpEff.jar",
4699            tool="snpeff",
4700            bin_type="jar",
4701            config=config,
4702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4703        )
4704        if not snpeff_bin_command:
4705            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4706            log.error(msg_err)
4707            raise ValueError(msg_err)
4708
4709        # Config - snpEff databases
4710        snpeff_databases = (
4711            config.get("folders", {})
4712            .get("databases", {})
4713            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4714        )
4715        snpeff_databases = full_path(snpeff_databases)
4716        if snpeff_databases is not None and snpeff_databases != "":
4717            log.debug(f"Create snpEff databases folder")
4718            if not os.path.exists(snpeff_databases):
4719                os.makedirs(snpeff_databases)
4720
4721        # Param
4722        param = self.get_param()
4723        log.debug("Param: " + str(param))
4724
4725        # Param
4726        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4727        log.debug("Options: " + str(options))
4728
4729        # Param - Assembly
4730        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4731
4732        # Param - Options
4733        snpeff_options = (
4734            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4735        )
4736        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4737        snpeff_csvstats = (
4738            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4739        )
4740        if snpeff_stats:
4741            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4742            snpeff_stats = full_path(snpeff_stats)
4743            snpeff_options += f" -stats {snpeff_stats}"
4744        if snpeff_csvstats:
4745            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4746            snpeff_csvstats = full_path(snpeff_csvstats)
4747            snpeff_options += f" -csvStats {snpeff_csvstats}"
4748
4749        # Data
4750        table_variants = self.get_table_variants()
4751
4752        # Check if not empty
4753        log.debug("Check if not empty")
4754        sql_query_chromosomes = (
4755            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4756        )
4757        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4758        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4759            log.info(f"VCF empty")
4760            return
4761
4762        # Export in VCF
4763        log.debug("Create initial file to annotate")
4764        tmp_vcf = NamedTemporaryFile(
4765            prefix=self.get_prefix(),
4766            dir=self.get_tmp_dir(),
4767            suffix=".vcf.gz",
4768            delete=True,
4769        )
4770        tmp_vcf_name = tmp_vcf.name
4771
4772        # VCF header
4773        vcf_reader = self.get_header()
4774        log.debug("Initial header: " + str(vcf_reader.infos))
4775
4776        # Existing annotations
4777        for vcf_annotation in self.get_header().infos:
4778
4779            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4780            log.debug(
4781                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4782            )
4783
4784        # Memory limit
4785        # if config.get("memory", None):
4786        #     memory_limit = config.get("memory", "8G")
4787        # else:
4788        #     memory_limit = "8G"
4789        memory_limit = self.get_memory("8G")
4790        log.debug(f"memory_limit: {memory_limit}")
4791
4792        # snpEff java options
4793        snpeff_java_options = (
4794            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4795        )
4796        log.debug(f"Exomiser java options: {snpeff_java_options}")
4797
4798        force_update_annotation = True
4799
4800        if "ANN" not in self.get_header().infos or force_update_annotation:
4801
4802            # Check snpEff database
4803            log.debug(f"Check snpEff databases {[assembly]}")
4804            databases_download_snpeff(
4805                folder=snpeff_databases, assemblies=[assembly], config=config
4806            )
4807
4808            # Export VCF file
4809            self.export_variant_vcf(
4810                vcf_file=tmp_vcf_name,
4811                remove_info=True,
4812                add_samples=False,
4813                index=True,
4814            )
4815
4816            # Tmp file
4817            err_files = []
4818            tmp_annotate_vcf = NamedTemporaryFile(
4819                prefix=self.get_prefix(),
4820                dir=self.get_tmp_dir(),
4821                suffix=".vcf",
4822                delete=False,
4823            )
4824            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4825            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4826            err_files.append(tmp_annotate_vcf_name_err)
4827
4828            # Command
4829            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4830            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4831            run_parallel_commands([snpeff_command], 1)
4832
4833            # Error messages
4834            log.info(f"Error/Warning messages:")
4835            error_message_command_all = []
4836            error_message_command_warning = []
4837            error_message_command_err = []
4838            for err_file in err_files:
4839                with open(err_file, "r") as f:
4840                    for line in f:
4841                        message = line.strip()
4842                        error_message_command_all.append(message)
4843                        if line.startswith("[W::"):
4844                            error_message_command_warning.append(message)
4845                        if line.startswith("[E::"):
4846                            error_message_command_err.append(f"{err_file}: " + message)
4847            # log info
4848            for message in list(
4849                set(error_message_command_err + error_message_command_warning)
4850            ):
4851                log.info(f"   {message}")
4852            # debug info
4853            for message in list(set(error_message_command_all)):
4854                log.debug(f"   {message}")
4855            # failed
4856            if len(error_message_command_err):
4857                log.error("Annotation failed: Error in commands")
4858                raise ValueError("Annotation failed: Error in commands")
4859
4860            # Find annotation in header
4861            with open(tmp_annotate_vcf_name, "rt") as f:
4862                header_list = self.read_vcf_header(f)
4863            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4864
4865            for ann in annovar_vcf_header.infos:
4866                if ann not in self.get_header().infos:
4867                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4868
4869            # Update variants
4870            log.info(f"Annotation - Updating...")
4871            self.update_from_vcf(tmp_annotate_vcf_name)
4872
4873        else:
4874            if "ANN" in self.get_header().infos:
4875                log.debug(f"Existing snpEff annotations in VCF")
4876            if force_update_annotation:
4877                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
4878
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar.

        Exports the variants table to a temporary VCF, runs ``table_annovar.pl``
        once per configured database (downloading missing databases first),
        post-processes each output through a bcftools/sed/awk pipeline, merges
        all annotated files with ``bcftools merge``, and updates the in-memory
        header and the variants table with the new INFO fields.

        :param threads: number of threads to use (defaults to ``self.get_threads()``)
        :raises ValueError: if the annovar or bcftools binary command cannot be
            resolved, or if a command reports errors on stderr
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup / error reporting)
        tmp_files = []
        err_files = []

        # Keep tmp files/folders for inspection when verbosity is debug
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper around table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options, e.g. genebase)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: {database_name: {field: renamed_field, ...}, ...}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even if fields already exist in the header
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No explicit fields means "keep everything" (INFO)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here, so only the current
                # iteration's stderr is checked below — confirm intended.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to keep (original and renamed names)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "INFO/old new" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: g=gene-based, r=region-based, f=filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar (run table_annovar.pl, then move the multianno output)
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value "."): awk rebuilds the
                # INFO column keeping only key=value pairs whose value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # ("^INFO/x" in bcftools -x means "remove all INFO fields EXCEPT x")
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: collect warnings and errors from stderr captures
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            # Merge all per-database annotated VCFs back with the original
            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and add new INFO
                # definitions to the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
5259
5260    # Parquet
5261    def annotation_parquet(self, threads: int = None) -> None:
5262        """
5263        It takes a VCF file, and annotates it with a parquet file
5264
5265        :param threads: number of threads to use for the annotation
5266        :return: the value of the variable "result".
5267        """
5268
5269        # DEBUG
5270        log.debug("Start annotation with parquet databases")
5271
5272        # Threads
5273        if not threads:
5274            threads = self.get_threads()
5275        log.debug("Threads: " + str(threads))
5276
5277        # DEBUG
5278        delete_tmp = True
5279        if self.get_config().get("verbosity", "warning") in ["debug"]:
5280            delete_tmp = False
5281            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5282
5283        # Config
5284        databases_folders = set(
5285            self.get_config()
5286            .get("folders", {})
5287            .get("databases", {})
5288            .get("annotations", ["."])
5289            + self.get_config()
5290            .get("folders", {})
5291            .get("databases", {})
5292            .get("parquet", ["."])
5293        )
5294        log.debug("Databases annotations: " + str(databases_folders))
5295
5296        # Param
5297        annotations = (
5298            self.get_param()
5299            .get("annotation", {})
5300            .get("parquet", {})
5301            .get("annotations", None)
5302        )
5303        log.debug("Annotations: " + str(annotations))
5304
5305        # Assembly
5306        assembly = self.get_param().get(
5307            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5308        )
5309
5310        # Force Update Annotation
5311        force_update_annotation = (
5312            self.get_param()
5313            .get("annotation", {})
5314            .get("options", {})
5315            .get("annotations_update", False)
5316        )
5317        log.debug(f"force_update_annotation={force_update_annotation}")
5318        force_append_annotation = (
5319            self.get_param()
5320            .get("annotation", {})
5321            .get("options", {})
5322            .get("annotations_append", False)
5323        )
5324        log.debug(f"force_append_annotation={force_append_annotation}")
5325
5326        # Data
5327        table_variants = self.get_table_variants()
5328
5329        # Check if not empty
5330        log.debug("Check if not empty")
5331        sql_query_chromosomes_df = self.get_query_to_df(
5332            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5333        )
5334        if not sql_query_chromosomes_df["count"][0]:
5335            log.info(f"VCF empty")
5336            return
5337
5338        # VCF header
5339        vcf_reader = self.get_header()
5340        log.debug("Initial header: " + str(vcf_reader.infos))
5341
5342        # Nb Variants POS
5343        log.debug("NB Variants Start")
5344        nb_variants = self.conn.execute(
5345            f"SELECT count(*) AS count FROM variants"
5346        ).fetchdf()["count"][0]
5347        log.debug("NB Variants Stop")
5348
5349        # Existing annotations
5350        for vcf_annotation in self.get_header().infos:
5351
5352            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5353            log.debug(
5354                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5355            )
5356
5357        # Added columns
5358        added_columns = []
5359
5360        # drop indexes
5361        log.debug(f"Drop indexes...")
5362        self.drop_indexes()
5363
5364        if annotations:
5365
5366            if "ALL" in annotations:
5367
5368                all_param = annotations.get("ALL", {})
5369                all_param_formats = all_param.get("formats", None)
5370                all_param_releases = all_param.get("releases", None)
5371
5372                databases_infos_dict = self.scan_databases(
5373                    database_formats=all_param_formats,
5374                    database_releases=all_param_releases,
5375                )
5376                for database_infos in databases_infos_dict.keys():
5377                    if database_infos not in annotations:
5378                        annotations[database_infos] = {"INFO": None}
5379
5380            for annotation in annotations:
5381
5382                if annotation in ["ALL"]:
5383                    continue
5384
5385                # Annotation Name
5386                annotation_name = os.path.basename(annotation)
5387
5388                # Annotation fields
5389                annotation_fields = annotations[annotation]
5390                if not annotation_fields:
5391                    annotation_fields = {"INFO": None}
5392
5393                log.debug(f"Annotation '{annotation_name}'")
5394                log.debug(
5395                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5396                )
5397
5398                # Create Database
5399                database = Database(
5400                    database=annotation,
5401                    databases_folders=databases_folders,
5402                    assembly=assembly,
5403                )
5404
5405                # Find files
5406                parquet_file = database.get_database()
5407                parquet_hdr_file = database.get_header_file()
5408                parquet_type = database.get_type()
5409
5410                # Check if files exists
5411                if not parquet_file or not parquet_hdr_file:
5412                    log.error("Annotation failed: file not found")
5413                    raise ValueError("Annotation failed: file not found")
5414                else:
5415                    # Get parquet connexion
5416                    parquet_sql_attach = database.get_sql_database_attach(
5417                        output="query"
5418                    )
5419                    if parquet_sql_attach:
5420                        self.conn.execute(parquet_sql_attach)
5421                    parquet_file_link = database.get_sql_database_link()
5422                    # Log
5423                    log.debug(
5424                        f"Annotation '{annotation_name}' - file: "
5425                        + str(parquet_file)
5426                        + " and "
5427                        + str(parquet_hdr_file)
5428                    )
5429
5430                    # Database full header columns
5431                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5432                        parquet_hdr_file
5433                    )
5434                    # Log
5435                    log.debug(
5436                        "Annotation database header columns : "
5437                        + str(parquet_hdr_vcf_header_columns)
5438                    )
5439
5440                    # Load header as VCF object
5441                    parquet_hdr_vcf_header_infos = database.get_header().infos
5442                    # Log
5443                    log.debug(
5444                        "Annotation database header: "
5445                        + str(parquet_hdr_vcf_header_infos)
5446                    )
5447
5448                    # Get extra infos
5449                    parquet_columns = database.get_extra_columns()
5450                    # Log
5451                    log.debug("Annotation database Columns: " + str(parquet_columns))
5452
5453                    # Add extra columns if "ALL" in annotation_fields
5454                    # if "ALL" in annotation_fields:
5455                    #     allow_add_extra_column = True
5456                    if "ALL" in annotation_fields and database.get_extra_columns():
5457                        for extra_column in database.get_extra_columns():
5458                            if (
5459                                extra_column not in annotation_fields
5460                                and extra_column.replace("INFO/", "")
5461                                not in parquet_hdr_vcf_header_infos
5462                            ):
5463                                parquet_hdr_vcf_header_infos[extra_column] = (
5464                                    vcf.parser._Info(
5465                                        extra_column,
5466                                        ".",
5467                                        "String",
5468                                        f"{extra_column} description",
5469                                        "unknown",
5470                                        "unknown",
5471                                        self.code_type_map["String"],
5472                                    )
5473                                )
5474
5475                    # For all fields in database
5476                    annotation_fields_all = False
5477                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5478                        annotation_fields_all = True
5479                        annotation_fields = {
5480                            key: key for key in parquet_hdr_vcf_header_infos
5481                        }
5482
5483                        log.debug(
5484                            "Annotation database header - All annotations added: "
5485                            + str(annotation_fields)
5486                        )
5487
5488                    # Init
5489
5490                    # List of annotation fields to use
5491                    sql_query_annotation_update_info_sets = []
5492
5493                    # List of annotation to agregate
5494                    sql_query_annotation_to_agregate = []
5495
5496                    # Number of fields
5497                    nb_annotation_field = 0
5498
5499                    # Annotation fields processed
5500                    annotation_fields_processed = []
5501
5502                    # Columns mapping
5503                    map_columns = database.map_columns(
5504                        columns=annotation_fields, prefixes=["INFO/"]
5505                    )
5506
5507                    # Query dict for fields to remove (update option)
5508                    query_dict_remove = {}
5509
5510                    # Fetch Anotation fields
5511                    for annotation_field in annotation_fields:
5512
5513                        # annotation_field_column
5514                        annotation_field_column = map_columns.get(
5515                            annotation_field, "INFO"
5516                        )
5517
5518                        # field new name, if parametered
5519                        annotation_fields_new_name = annotation_fields.get(
5520                            annotation_field, annotation_field
5521                        )
5522                        if not annotation_fields_new_name:
5523                            annotation_fields_new_name = annotation_field
5524
5525                        # To annotate
5526                        # force_update_annotation = True
5527                        # force_append_annotation = True
5528                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5529                        if annotation_field in parquet_hdr_vcf_header_infos and (
5530                            force_update_annotation
5531                            or force_append_annotation
5532                            or (
5533                                annotation_fields_new_name
5534                                not in self.get_header().infos
5535                            )
5536                        ):
5537
5538                            # Add field to annotation to process list
5539                            annotation_fields_processed.append(
5540                                annotation_fields_new_name
5541                            )
5542
5543                            # explode infos for the field
5544                            annotation_fields_new_name_info_msg = ""
5545                            if (
5546                                force_update_annotation
5547                                and annotation_fields_new_name
5548                                in self.get_header().infos
5549                            ):
5550                                # Remove field from INFO
5551                                query = f"""
5552                                    UPDATE {table_variants} as table_variants
5553                                    SET INFO = REGEXP_REPLACE(
5554                                                concat(table_variants.INFO,''),
5555                                                ';*{annotation_fields_new_name}=[^;]*',
5556                                                ''
5557                                                )
5558                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5559                                """
5560                                annotation_fields_new_name_info_msg = " [update]"
5561                                query_dict_remove[
5562                                    f"remove 'INFO/{annotation_fields_new_name}'"
5563                                ] = query
5564
5565                            # Sep between fields in INFO
5566                            nb_annotation_field += 1
5567                            if nb_annotation_field > 1:
5568                                annotation_field_sep = ";"
5569                            else:
5570                                annotation_field_sep = ""
5571
5572                            log.info(
5573                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5574                            )
5575
5576                            # Add INFO field to header
5577                            parquet_hdr_vcf_header_infos_number = (
5578                                parquet_hdr_vcf_header_infos[annotation_field].num
5579                                or "."
5580                            )
5581                            parquet_hdr_vcf_header_infos_type = (
5582                                parquet_hdr_vcf_header_infos[annotation_field].type
5583                                or "String"
5584                            )
5585                            parquet_hdr_vcf_header_infos_description = (
5586                                parquet_hdr_vcf_header_infos[annotation_field].desc
5587                                or f"{annotation_field} description"
5588                            )
5589                            parquet_hdr_vcf_header_infos_source = (
5590                                parquet_hdr_vcf_header_infos[annotation_field].source
5591                                or "unknown"
5592                            )
5593                            parquet_hdr_vcf_header_infos_version = (
5594                                parquet_hdr_vcf_header_infos[annotation_field].version
5595                                or "unknown"
5596                            )
5597
5598                            vcf_reader.infos[annotation_fields_new_name] = (
5599                                vcf.parser._Info(
5600                                    annotation_fields_new_name,
5601                                    parquet_hdr_vcf_header_infos_number,
5602                                    parquet_hdr_vcf_header_infos_type,
5603                                    parquet_hdr_vcf_header_infos_description,
5604                                    parquet_hdr_vcf_header_infos_source,
5605                                    parquet_hdr_vcf_header_infos_version,
5606                                    self.code_type_map[
5607                                        parquet_hdr_vcf_header_infos_type
5608                                    ],
5609                                )
5610                            )
5611
5612                            # Append
5613                            if force_append_annotation:
5614                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5615                            else:
5616                                query_case_when_append = ""
5617
5618                            # Annotation/Update query fields
5619                            # Found in INFO column
5620                            if (
5621                                annotation_field_column == "INFO"
5622                                and "INFO" in parquet_hdr_vcf_header_columns
5623                            ):
5624                                sql_query_annotation_update_info_sets.append(
5625                                    f"""
5626                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5627                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5628                                        ELSE ''
5629                                    END
5630                                """
5631                                )
5632                            # Found in a specific column
5633                            else:
5634                                sql_query_annotation_update_info_sets.append(
5635                                    f"""
5636                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5637                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5638                                        ELSE ''
5639                                    END
5640                                """
5641                                )
5642                                sql_query_annotation_to_agregate.append(
5643                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5644                                )
5645
5646                        # Not to annotate
5647                        else:
5648
5649                            if force_update_annotation:
5650                                annotation_message = "forced"
5651                            else:
5652                                annotation_message = "skipped"
5653
5654                            if annotation_field not in parquet_hdr_vcf_header_infos:
5655                                log.warning(
5656                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5657                                )
5658                            if annotation_fields_new_name in self.get_header().infos:
5659                                log.warning(
5660                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5661                                )
5662
5663                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5664                    # allow_annotation_full_info = True
5665                    allow_annotation_full_info = not force_append_annotation
5666
5667                    if parquet_type in ["regions"]:
5668                        allow_annotation_full_info = False
5669
5670                    if (
5671                        allow_annotation_full_info
5672                        and nb_annotation_field == len(annotation_fields)
5673                        and annotation_fields_all
5674                        and (
5675                            "INFO" in parquet_hdr_vcf_header_columns
5676                            and "INFO" in database.get_extra_columns()
5677                        )
5678                    ):
5679                        log.debug("Column INFO annotation enabled")
5680                        sql_query_annotation_update_info_sets = []
5681                        sql_query_annotation_update_info_sets.append(
5682                            f" table_parquet.INFO "
5683                        )
5684
5685                    if sql_query_annotation_update_info_sets:
5686
5687                        # Annotate
5688                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5689
5690                        # Join query annotation update info sets for SQL
5691                        sql_query_annotation_update_info_sets_sql = ",".join(
5692                            sql_query_annotation_update_info_sets
5693                        )
5694
5695                        # Check chromosomes list (and variants infos)
5696                        sql_query_chromosomes = f"""
5697                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5698                            FROM {table_variants} as table_variants
5699                            GROUP BY table_variants."#CHROM"
5700                            ORDER BY table_variants."#CHROM"
5701                            """
5702                        sql_query_chromosomes_df = self.conn.execute(
5703                            sql_query_chromosomes
5704                        ).df()
5705                        sql_query_chromosomes_dict = {
5706                            entry["CHROM"]: {
5707                                "count": entry["count_variants"],
5708                                "min": entry["min_variants"],
5709                                "max": entry["max_variants"],
5710                            }
5711                            for index, entry in sql_query_chromosomes_df.iterrows()
5712                        }
5713
5714                        # Init
5715                        nb_of_query = 0
5716                        nb_of_variant_annotated = 0
5717                        query_dict = query_dict_remove
5718
5719                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5720                        for chrom in sql_query_chromosomes_dict:
5721
5722                            # Number of variant by chromosome
5723                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5724                                chrom, {}
5725                            ).get("count", 0)
5726
5727                            log.debug(
5728                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5729                            )
5730
5731                            # Annotation with regions database
5732                            if parquet_type in ["regions"]:
5733                                sql_query_annotation_from_clause = f"""
5734                                    FROM (
5735                                        SELECT 
5736                                            '{chrom}' AS \"#CHROM\",
5737                                            table_variants_from.\"POS\" AS \"POS\",
5738                                            {",".join(sql_query_annotation_to_agregate)}
5739                                        FROM {table_variants} as table_variants_from
5740                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5741                                            table_parquet_from."#CHROM" = '{chrom}'
5742                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5743                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5744                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5745                                                )
5746                                        )
5747                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5748                                        GROUP BY table_variants_from.\"POS\"
5749                                        )
5750                                        as table_parquet
5751                                """
5752
5753                                sql_query_annotation_where_clause = """
5754                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5755                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5756                                """
5757
5758                            # Annotation with variants database
5759                            else:
5760                                sql_query_annotation_from_clause = f"""
5761                                    FROM {parquet_file_link} as table_parquet
5762                                """
5763                                sql_query_annotation_where_clause = f"""
5764                                    table_variants."#CHROM" = '{chrom}'
5765                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5766                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5767                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5768                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5769                                """
5770
5771                            # Create update query
5772                            sql_query_annotation_chrom_interval_pos = f"""
5773                                UPDATE {table_variants} as table_variants
5774                                    SET INFO = 
5775                                        concat(
5776                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5777                                                THEN table_variants.INFO
5778                                                ELSE ''
5779                                            END
5780                                            ,
5781                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5782                                                        AND (
5783                                                        concat({sql_query_annotation_update_info_sets_sql})
5784                                                        )
5785                                                        NOT IN ('','.') 
5786                                                    THEN ';'
5787                                                    ELSE ''
5788                                            END
5789                                            ,
5790                                            {sql_query_annotation_update_info_sets_sql}
5791                                            )
5792                                    {sql_query_annotation_from_clause}
5793                                    WHERE {sql_query_annotation_where_clause}
5794                                    ;
5795                                """
5796
5797                            # Add update query to dict
5798                            query_dict[
5799                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5800                            ] = sql_query_annotation_chrom_interval_pos
5801
5802                        nb_of_query = len(query_dict)
5803                        num_query = 0
5804
5805                        # SET max_expression_depth TO x
5806                        self.conn.execute("SET max_expression_depth TO 10000")
5807
5808                        for query_name in query_dict:
5809                            query = query_dict[query_name]
5810                            num_query += 1
5811                            log.info(
5812                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5813                            )
5814                            result = self.conn.execute(query)
5815                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5816                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5817                            log.info(
5818                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5819                            )
5820
5821                        log.info(
5822                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5823                        )
5824
5825                    else:
5826
5827                        log.info(
5828                            f"Annotation '{annotation_name}' - No Annotations available"
5829                        )
5830
5831                    log.debug("Final header: " + str(vcf_reader.infos))
5832
5833        # Remove added columns
5834        for added_column in added_columns:
5835            self.drop_column(column=added_column)
5836
5837    def annotation_splice(self, threads: int = None) -> None:
5838        """
5839        This function annotate with snpEff
5840
5841        :param threads: The number of threads to use
5842        :return: the value of the variable "return_value".
5843        """
5844
5845        # DEBUG
5846        log.debug("Start annotation with splice tools")
5847
5848        # Threads
5849        if not threads:
5850            threads = self.get_threads()
5851        log.debug("Threads: " + str(threads))
5852
5853        # DEBUG
5854        delete_tmp = True
5855        if self.get_config().get("verbosity", "warning") in ["debug"]:
5856            delete_tmp = False
5857            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5858
5859        # Config
5860        config = self.get_config()
5861        log.debug("Config: " + str(config))
5862        splice_config = config.get("tools", {}).get("splice", {})
5863        if not splice_config:
5864            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5865        if not splice_config:
5866            msg_err = "No Splice tool config"
5867            log.error(msg_err)
5868            raise ValueError(msg_err)
5869        log.debug(f"splice_config={splice_config}")
5870
5871        # Config - Folders - Databases
5872        databases_folders = (
5873            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5874        )
5875        log.debug("Databases annotations: " + str(databases_folders))
5876
5877        # Splice docker image
5878        splice_docker_image = splice_config.get("docker").get("image")
5879
5880        # Pull splice image if it's not already there
5881        if not check_docker_image_exists(splice_docker_image):
5882            log.warning(
5883                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5884            )
5885            try:
5886                command(f"docker pull {splice_config.get('docker').get('image')}")
5887            except subprocess.CalledProcessError:
5888                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5889                log.error(msg_err)
5890                raise ValueError(msg_err)
5891                return None
5892
5893        # Config - splice databases
5894        splice_databases = (
5895            config.get("folders", {})
5896            .get("databases", {})
5897            .get("splice", DEFAULT_SPLICE_FOLDER)
5898        )
5899        splice_databases = full_path(splice_databases)
5900
5901        # Param
5902        param = self.get_param()
5903        log.debug("Param: " + str(param))
5904
5905        # Param
5906        options = param.get("annotation", {}).get("splice", {})
5907        log.debug("Options: " + str(options))
5908
5909        # Data
5910        table_variants = self.get_table_variants()
5911
5912        # Check if not empty
5913        log.debug("Check if not empty")
5914        sql_query_chromosomes = (
5915            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5916        )
5917        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5918            log.info("VCF empty")
5919            return None
5920
5921        # Export in VCF
5922        log.debug("Create initial file to annotate")
5923
5924        # Create output folder
5925        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5926        if not os.path.exists(output_folder):
5927            Path(output_folder).mkdir(parents=True, exist_ok=True)
5928
5929        # Create tmp VCF file
5930        tmp_vcf = NamedTemporaryFile(
5931            prefix=self.get_prefix(),
5932            dir=output_folder,
5933            suffix=".vcf",
5934            delete=False,
5935        )
5936        tmp_vcf_name = tmp_vcf.name
5937
5938        # VCF header
5939        header = self.get_header()
5940
5941        # Existing annotations
5942        for vcf_annotation in self.get_header().infos:
5943
5944            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5945            log.debug(
5946                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5947            )
5948
5949        # Memory limit
5950        if config.get("memory", None):
5951            memory_limit = config.get("memory", "8G").upper()
5952            # upper()
5953        else:
5954            memory_limit = "8G"
5955        log.debug(f"memory_limit: {memory_limit}")
5956
5957        # Export VCF file
5958        self.export_variant_vcf(
5959            vcf_file=tmp_vcf_name,
5960            remove_info=True,
5961            add_samples=True,
5962            index=False,
5963        )
5964
5965        # Create docker container and launch splice analysis
5966        if splice_config:
5967
5968            # Splice mount folders
5969            mount_folders = splice_config.get("mount", {})
5970
5971            # Genome mount
5972            mount_folders[
5973                config.get("folders", {})
5974                .get("databases", {})
5975                .get("genomes", DEFAULT_GENOME_FOLDER)
5976            ] = "ro"
5977
5978            # SpliceAI mount
5979            mount_folders[
5980                config.get("folders", {})
5981                .get("databases", {})
5982                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
5983            ] = "ro"
5984
5985            # Genome mount
5986            mount_folders[
5987                config.get("folders", {})
5988                .get("databases", {})
5989                .get("spip", DEFAULT_SPIP_FOLDER)
5990            ] = "ro"
5991
5992            # Mount folders
5993            mount = []
5994
5995            # Config mount
5996            mount = [
5997                f"-v {full_path(path)}:{full_path(path)}:{mode}"
5998                for path, mode in mount_folders.items()
5999            ]
6000
6001            if any(value for value in splice_config.values() if value is None):
6002                log.warning("At least one splice config parameter is empty")
6003                return None
6004
6005            # Params in splice nf
6006            def check_values(dico: dict):
6007                """
6008                Ensure parameters for NF splice pipeline
6009                """
6010                for key, val in dico.items():
6011                    if key == "genome":
6012                        if any(
6013                            assemb in options.get("genome", {})
6014                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6015                        ):
6016                            yield f"--{key} hg19"
6017                        elif any(
6018                            assemb in options.get("genome", {})
6019                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6020                        ):
6021                            yield f"--{key} hg38"
6022                    elif (
6023                        (isinstance(val, str) and val)
6024                        or isinstance(val, int)
6025                        or isinstance(val, bool)
6026                    ):
6027                        yield f"--{key} {val}"
6028
6029            # Genome
6030            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6031            options["genome"] = genome
6032
6033            # NF params
6034            nf_params = []
6035
6036            # Add options
6037            if options:
6038                nf_params = list(check_values(options))
6039                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6040            else:
6041                log.debug("No NF params provided")
6042
6043            # Add threads
6044            if "threads" not in options.keys():
6045                nf_params.append(f"--threads {threads}")
6046
6047            # Genome path
6048            genome_path = find_genome(
6049                config.get("folders", {})
6050                .get("databases", {})
6051                .get("genomes", DEFAULT_GENOME_FOLDER),
6052                file=f"{genome}.fa",
6053            )
6054            # Add genome path
6055            if not genome_path:
6056                raise ValueError(
6057                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6058                )
6059            else:
6060                log.debug(f"Genome: {genome_path}")
6061                nf_params.append(f"--genome_path {genome_path}")
6062
6063            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6064                """
6065                Setting up updated databases for SPiP and SpliceAI
6066                """
6067
6068                try:
6069
6070                    # SpliceAI assembly transcriptome
6071                    spliceai_assembly = os.path.join(
6072                        config.get("folders", {})
6073                        .get("databases", {})
6074                        .get("spliceai", {}),
6075                        options.get("genome"),
6076                        "transcriptome",
6077                    )
6078                    spip_assembly = options.get("genome")
6079
6080                    spip = find(
6081                        f"transcriptome_{spip_assembly}.RData",
6082                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6083                    )
6084                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6085                    log.debug(f"SPiP annotations: {spip}")
6086                    log.debug(f"SpliceAI annotations: {spliceai}")
6087                    if spip and spliceai:
6088                        return [
6089                            f"--spip_transcriptome {spip}",
6090                            f"--spliceai_annotations {spliceai}",
6091                        ]
6092                    else:
6093                        # TODO crash and go on with basic annotations ?
6094                        # raise ValueError(
6095                        #     "Can't find splice databases in configuration EXIT"
6096                        # )
6097                        log.warning(
6098                            "Can't find splice databases in configuration, use annotations file from image"
6099                        )
6100                except TypeError:
6101                    log.warning(
6102                        "Can't find splice databases in configuration, use annotations file from image"
6103                    )
6104                    return []
6105
            # Add options, check if the transcriptome option has already been provided
6107            if (
6108                "spip_transcriptome" not in nf_params
6109                and "spliceai_transcriptome" not in nf_params
6110            ):
6111                splice_reference = splice_annotations(options, config)
6112                if splice_reference:
6113                    nf_params.extend(splice_reference)
6114
6115            nf_params.append(f"--output_folder {output_folder}")
6116
6117            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6118            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6119            log.debug(cmd)
6120
6121            splice_config["docker"]["command"] = cmd
6122
6123            docker_cmd = get_bin_command(
6124                tool="splice",
6125                bin_type="docker",
6126                config=config,
6127                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6128                add_options=f"--name {random_uuid} {' '.join(mount)}",
6129            )
6130
6131            # Docker debug
6132            # if splice_config.get("rm_container"):
6133            #     rm_container = "--rm"
6134            # else:
6135            #     rm_container = ""
6136            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6137
6138            log.debug(docker_cmd)
6139            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6140            log.debug(res.stdout)
6141            if res.stderr:
6142                log.error(res.stderr)
6143            res.check_returncode()
6144        else:
6145            log.warning(f"Splice tool configuration not found: {config}")
6146
6147        # Update variants
6148        log.info("Annotation - Updating...")
6149        # Test find output vcf
6150        log.debug(
6151            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6152        )
6153        output_vcf = []
6154        # Wrong folder to look in
6155        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6156            if (
6157                files
6158                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6159            ):
6160                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6161        # log.debug(os.listdir(options.get("output_folder")))
6162        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6163        if not output_vcf:
6164            log.debug(
6165                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6166            )
6167        else:
6168            # Get new header from annotated vcf
6169            log.debug(f"Initial header: {len(header.infos)} fields")
6170            # Create new header with splice infos
6171            new_vcf = Variants(input=output_vcf[0])
6172            new_vcf_header = new_vcf.get_header().infos
6173            for keys, infos in new_vcf_header.items():
6174                if keys not in header.infos.keys():
6175                    header.infos[keys] = infos
6176            log.debug(f"New header: {len(header.infos)} fields")
6177            log.debug(f"Splice tmp output: {output_vcf[0]}")
6178            self.update_from_vcf(output_vcf[0])
6179
6180        # Remove folder
6181        remove_if_exists(output_folder)
6182
6183    ###
6184    # Prioritization
6185    ###
6186
6187    def get_config_default(self, name: str) -> dict:
6188        """
6189        The function `get_config_default` returns a dictionary containing default configurations for
6190        various calculations and prioritizations.
6191
6192        :param name: The `get_config_default` function returns a dictionary containing default
6193        configurations for different calculations and prioritizations. The `name` parameter is used to
6194        specify which specific configuration to retrieve from the dictionary
6195        :type name: str
6196        :return: The function `get_config_default` returns a dictionary containing default configuration
6197        settings for different calculations and prioritizations. The specific configuration settings are
6198        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6199        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6200        returned. If there is no match, an empty dictionary is returned.
6201        """
6202
6203        config_default = {
6204            "calculations": {
6205                "variant_chr_pos_alt_ref": {
6206                    "type": "sql",
6207                    "name": "variant_chr_pos_alt_ref",
6208                    "description": "Create a variant ID with chromosome, position, alt and ref",
6209                    "available": False,
6210                    "output_column_name": "variant_chr_pos_alt_ref",
6211                    "output_column_type": "String",
6212                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6213                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6214                    "operation_info": True,
6215                },
6216                "VARTYPE": {
6217                    "type": "sql",
6218                    "name": "VARTYPE",
6219                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6220                    "available": True,
6221                    "output_column_name": "VARTYPE",
6222                    "output_column_type": "String",
6223                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6224                    "operation_query": """
6225                            CASE
6226                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6227                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6228                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6229                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6230                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6231                                ELSE 'UNDEFINED'
6232                            END
6233                            """,
6234                    "info_fields": ["SVTYPE"],
6235                    "operation_info": True,
6236                },
6237                "snpeff_hgvs": {
6238                    "type": "python",
6239                    "name": "snpeff_hgvs",
6240                    "description": "HGVS nomenclatures from snpEff annotation",
6241                    "available": True,
6242                    "function_name": "calculation_extract_snpeff_hgvs",
6243                    "function_params": [],
6244                },
6245                "NOMEN": {
6246                    "type": "python",
6247                    "name": "NOMEN",
6248                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6249                    "available": True,
6250                    "function_name": "calculation_extract_nomen",
6251                    "function_params": [],
6252                },
6253                "FINDBYPIPELINE": {
6254                    "type": "python",
6255                    "name": "FINDBYPIPELINE",
6256                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6257                    "available": True,
6258                    "function_name": "calculation_find_by_pipeline",
6259                    "function_params": ["findbypipeline"],
6260                },
6261                "FINDBYSAMPLE": {
6262                    "type": "python",
6263                    "name": "FINDBYSAMPLE",
6264                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6265                    "available": True,
6266                    "function_name": "calculation_find_by_pipeline",
6267                    "function_params": ["findbysample"],
6268                },
6269                "GENOTYPECONCORDANCE": {
6270                    "type": "python",
6271                    "name": "GENOTYPECONCORDANCE",
6272                    "description": "Concordance of genotype for multi caller VCF",
6273                    "available": True,
6274                    "function_name": "calculation_genotype_concordance",
6275                    "function_params": [],
6276                },
6277                "BARCODE": {
6278                    "type": "python",
6279                    "name": "BARCODE",
6280                    "description": "BARCODE as VaRank tool",
6281                    "available": True,
6282                    "function_name": "calculation_barcode",
6283                    "function_params": [],
6284                },
6285                "BARCODEFAMILY": {
6286                    "type": "python",
6287                    "name": "BARCODEFAMILY",
6288                    "description": "BARCODEFAMILY as VaRank tool",
6289                    "available": True,
6290                    "function_name": "calculation_barcode_family",
6291                    "function_params": ["BCF"],
6292                },
6293                "TRIO": {
6294                    "type": "python",
6295                    "name": "TRIO",
6296                    "description": "Inheritance for a trio family",
6297                    "available": True,
6298                    "function_name": "calculation_trio",
6299                    "function_params": [],
6300                },
6301                "VAF": {
6302                    "type": "python",
6303                    "name": "VAF",
6304                    "description": "Variant Allele Frequency (VAF) harmonization",
6305                    "available": True,
6306                    "function_name": "calculation_vaf_normalization",
6307                    "function_params": [],
6308                },
6309                "VAF_stats": {
6310                    "type": "python",
6311                    "name": "VAF_stats",
6312                    "description": "Variant Allele Frequency (VAF) statistics",
6313                    "available": True,
6314                    "function_name": "calculation_genotype_stats",
6315                    "function_params": ["VAF"],
6316                },
6317                "DP_stats": {
6318                    "type": "python",
6319                    "name": "DP_stats",
6320                    "description": "Depth (DP) statistics",
6321                    "available": True,
6322                    "function_name": "calculation_genotype_stats",
6323                    "function_params": ["DP"],
6324                },
6325                "variant_id": {
6326                    "type": "python",
6327                    "name": "variant_id",
6328                    "description": "Variant ID generated from variant position and type",
6329                    "available": True,
6330                    "function_name": "calculation_variant_id",
6331                    "function_params": [],
6332                },
6333            },
6334            "prioritizations": {
6335                "default": {
6336                    "filter": [
6337                        {
6338                            "type": "notequals",
6339                            "value": "!PASS|\\.",
6340                            "score": 0,
6341                            "flag": "FILTERED",
6342                            "comment": ["Bad variant quality"],
6343                        },
6344                        {
6345                            "type": "equals",
6346                            "value": "REJECT",
6347                            "score": -20,
6348                            "flag": "PASS",
6349                            "comment": ["Bad variant quality"],
6350                        },
6351                    ],
6352                    "DP": [
6353                        {
6354                            "type": "gte",
6355                            "value": "50",
6356                            "score": 5,
6357                            "flag": "PASS",
6358                            "comment": ["DP higher than 50"],
6359                        }
6360                    ],
6361                    "ANN": [
6362                        {
6363                            "type": "contains",
6364                            "value": "HIGH",
6365                            "score": 5,
6366                            "flag": "PASS",
6367                            "comment": [
6368                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6369                            ],
6370                        },
6371                        {
6372                            "type": "contains",
6373                            "value": "MODERATE",
6374                            "score": 3,
6375                            "flag": "PASS",
6376                            "comment": [
6377                                "A non-disruptive variant that might change protein effectiveness"
6378                            ],
6379                        },
6380                        {
6381                            "type": "contains",
6382                            "value": "LOW",
6383                            "score": 0,
6384                            "flag": "FILTERED",
6385                            "comment": [
6386                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6387                            ],
6388                        },
6389                        {
6390                            "type": "contains",
6391                            "value": "MODIFIER",
6392                            "score": 0,
6393                            "flag": "FILTERED",
6394                            "comment": [
6395                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6396                            ],
6397                        },
6398                    ],
6399                }
6400            },
6401        }
6402
6403        return config_default.get(name, None)
6404
6405    def get_config_json(
6406        self, name: str, config_dict: dict = {}, config_file: str = None
6407    ) -> dict:
6408        """
6409        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6410        default values, a dictionary, and a file.
6411
6412        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6413        the name of the configuration. It is used to identify and retrieve the configuration settings
6414        for a specific component or module
6415        :type name: str
6416        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6417        dictionary that allows you to provide additional configuration settings or overrides. When you
6418        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6419        the key is the configuration setting you want to override or
6420        :type config_dict: dict
6421        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6422        specify the path to a configuration file that contains additional settings. If provided, the
6423        function will read the contents of this file and update the configuration dictionary with the
6424        values found in the file, overriding any existing values with the
6425        :type config_file: str
6426        :return: The function `get_config_json` returns a dictionary containing the configuration
6427        settings.
6428        """
6429
6430        # Create with default prioritizations
6431        config_default = self.get_config_default(name=name)
6432        configuration = config_default
6433        # log.debug(f"configuration={configuration}")
6434
6435        # Replace prioritizations from dict
6436        for config in config_dict:
6437            configuration[config] = config_dict[config]
6438
6439        # Replace prioritizations from file
6440        config_file = full_path(config_file)
6441        if config_file:
6442            if os.path.exists(config_file):
6443                with open(config_file) as config_file_content:
6444                    config_file_dict = json.load(config_file_content)
6445                for config in config_file_dict:
6446                    configuration[config] = config_file_dict[config]
6447            else:
6448                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6449                log.error(msg_error)
6450                raise ValueError(msg_error)
6451
6452        return configuration
6453
6454    # def get_prioritizations_config(self, prioritizations_config_dict:dict = {}, prioritizations_config_file:str = None) -> dict:
6455
6456    #     # Create with default prioritizations
6457    #     prioritizations_config = self.get_config_default("prioritization")
6458
6459    #     # Replace prioritizations from dict
6460    #     for prioritization_config in prioritizations_config_dict:
6461    #         prioritizations_config[prioritization_config] = prioritizations_config_dict[prioritization_config]
6462
6463    #     # Replace prioritizations from file
6464    #     prioritizations_config_file = full_path(prioritizations_config_file)
6465    #     if prioritizations_config_file:
6466    #         if os.path.exists(prioritizations_config_file):
6467    #             with open(prioritizations_config_file) as prioritizations_config_file_content:
6468    #                 prioritizations_config_file_dict = json.load(prioritizations_config_file_content)
6469    #             for prioritization_config in prioritizations_config_file_dict:
6470    #                 prioritizations_config[prioritization_config] = prioritizations_config_file_dict[prioritization_config]
6471    #         else:
6472    #             log.error(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist")
6473    #             raise ValueError(f"Prioritizations config file '{prioritizations_config_file}' does NOT exist")
6474
6475    #     return prioritizations_config
6476
6477    def prioritization(self) -> None:
6478        """
6479        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6480        INFO fields
6481        """
6482
6483        # Config
6484        config = self.get_config()
6485
6486        # Param
6487        param = self.get_param()
6488
6489        # Quick Prioritizations
6490        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6491
6492        # Configuration profiles
6493        prioritization_config_file = param.get("prioritization", {}).get(
6494            "prioritization_config", None
6495        )
6496        prioritization_config_file = full_path(prioritization_config_file)
6497        prioritizations_config = self.get_config_json(
6498            name="prioritizations", config_file=prioritization_config_file
6499        )
6500
6501        # Prioritization options
6502        profiles = param.get("prioritization", {}).get("profiles", [])
6503        if isinstance(profiles, str):
6504            profiles = profiles.split(",")
6505        pzfields = param.get("prioritization", {}).get(
6506            "pzfields", ["PZFlag", "PZScore"]
6507        )
6508        if isinstance(pzfields, str):
6509            pzfields = pzfields.split(",")
6510        default_profile = param.get("prioritization", {}).get("default_profile", None)
6511        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6512        prioritization_score_mode = param.get("prioritization", {}).get(
6513            "prioritization_score_mode", "HOWARD"
6514        )
6515
6516        # Quick Prioritizations
6517        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6518        prioritizations = param.get("prioritizations", None)
6519        if prioritizations:
6520            log.info("Quick Prioritization:")
6521            for profile in prioritizations.split(","):
6522                if profile not in profiles:
6523                    profiles.append(profile)
6524                    log.info(f"   {profile}")
6525
6526        # If profile "ALL" provided, all profiles in the config profiles
6527        if "ALL" in profiles:
6528            profiles = list(prioritizations_config.keys())
6529
6530        for profile in profiles:
6531            if prioritizations_config.get(profile, None):
6532                log.debug(f"Profile '{profile}' configured")
6533            else:
6534                msg_error = f"Profile '{profile}' NOT configured"
6535                log.error(msg_error)
6536                raise ValueError(msg_error)
6537
6538        if profiles:
6539            log.info(f"Prioritization... ")
6540        else:
6541            log.debug(f"No profile defined")
6542            return
6543
6544        if not default_profile and len(profiles):
6545            default_profile = profiles[0]
6546
6547        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6548        log.debug("Profiles to check: " + str(list(profiles)))
6549
6550        # Variables
6551        table_variants = self.get_table_variants(clause="update")
6552
6553        # Added columns
6554        added_columns = []
6555
6556        # Create list of PZfields
6557        # List of PZFields
6558        list_of_pzfields_original = pzfields + [
6559            pzfield + pzfields_sep + profile
6560            for pzfield in pzfields
6561            for profile in profiles
6562        ]
6563        list_of_pzfields = []
6564        log.debug(f"{list_of_pzfields_original}")
6565
6566        # Remove existing PZfields to use if exists
6567        for pzfield in list_of_pzfields_original:
6568            if self.get_header().infos.get(pzfield, None) is None:
6569                list_of_pzfields.append(pzfield)
6570                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6571            else:
6572                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6573
6574        if list_of_pzfields:
6575
6576            # Explode Infos fields
6577            explode_infos_prefix = self.get_explode_infos_prefix()
6578            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6579            extra_infos = self.get_extra_infos()
6580
6581            # PZfields tags description
6582            PZfields_INFOS = {
6583                "PZTags": {
6584                    "ID": "PZTags",
6585                    "Number": ".",
6586                    "Type": "String",
6587                    "Description": "Variant tags based on annotation criteria",
6588                },
6589                "PZScore": {
6590                    "ID": "PZScore",
6591                    "Number": 1,
6592                    "Type": "Integer",
6593                    "Description": "Variant score based on annotation criteria",
6594                },
6595                "PZFlag": {
6596                    "ID": "PZFlag",
6597                    "Number": 1,
6598                    "Type": "String",
6599                    "Description": "Variant flag based on annotation criteria",
6600                },
6601                "PZComment": {
6602                    "ID": "PZComment",
6603                    "Number": ".",
6604                    "Type": "String",
6605                    "Description": "Variant comment based on annotation criteria",
6606                },
6607                "PZInfos": {
6608                    "ID": "PZInfos",
6609                    "Number": ".",
6610                    "Type": "String",
6611                    "Description": "Variant infos based on annotation criteria",
6612                },
6613            }
6614
6615            # Create INFO fields if not exist
6616            for field in PZfields_INFOS:
6617                field_ID = PZfields_INFOS[field]["ID"]
6618                field_description = PZfields_INFOS[field]["Description"]
6619                if field_ID not in self.get_header().infos and field_ID in pzfields:
6620                    field_description = (
6621                        PZfields_INFOS[field]["Description"]
6622                        + f", profile {default_profile}"
6623                    )
6624                    self.get_header().infos[field_ID] = vcf.parser._Info(
6625                        field_ID,
6626                        PZfields_INFOS[field]["Number"],
6627                        PZfields_INFOS[field]["Type"],
6628                        field_description,
6629                        "unknown",
6630                        "unknown",
6631                        code_type_map[PZfields_INFOS[field]["Type"]],
6632                    )
6633
6634            # Create INFO fields if not exist for each profile
6635            for profile in prioritizations_config:
6636                if profile in profiles or profiles == []:
6637                    for field in PZfields_INFOS:
6638                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6639                        field_description = (
6640                            PZfields_INFOS[field]["Description"]
6641                            + f", profile {profile}"
6642                        )
6643                        if (
6644                            field_ID not in self.get_header().infos
6645                            and field in pzfields
6646                        ):
6647                            self.get_header().infos[field_ID] = vcf.parser._Info(
6648                                field_ID,
6649                                PZfields_INFOS[field]["Number"],
6650                                PZfields_INFOS[field]["Type"],
6651                                field_description,
6652                                "unknown",
6653                                "unknown",
6654                                code_type_map[PZfields_INFOS[field]["Type"]],
6655                            )
6656
6657            # Header
6658            for pzfield in list_of_pzfields:
6659                if re.match("PZScore.*", pzfield):
6660                    added_column = self.add_column(
6661                        table_name=table_variants,
6662                        column_name=pzfield,
6663                        column_type="INTEGER",
6664                        default_value="0",
6665                    )
6666                elif re.match("PZFlag.*", pzfield):
6667                    added_column = self.add_column(
6668                        table_name=table_variants,
6669                        column_name=pzfield,
6670                        column_type="BOOLEAN",
6671                        default_value="1",
6672                    )
6673                else:
6674                    added_column = self.add_column(
6675                        table_name=table_variants,
6676                        column_name=pzfield,
6677                        column_type="STRING",
6678                        default_value="''",
6679                    )
6680                added_columns.append(added_column)
6681
6682            # Profiles
6683            if profiles:
6684
6685                # foreach profile in configuration file
6686                for profile in prioritizations_config:
6687
6688                    # If profile is asked in param, or ALL are asked (empty profile [])
6689                    if profile in profiles or profiles == []:
6690                        log.info(f"Profile '{profile}'")
6691
6692                        sql_set_info_option = ""
6693
6694                        sql_set_info = []
6695
6696                        # PZ fields set
6697
6698                        # PZScore
6699                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6700                            sql_set_info.append(
6701                                f"""
6702                                    concat(
6703                                        'PZScore{pzfields_sep}{profile}=',
6704                                        PZScore{pzfields_sep}{profile}
6705                                    ) 
6706                                """
6707                            )
6708                            if (
6709                                profile == default_profile
6710                                and "PZScore" in list_of_pzfields
6711                            ):
6712                                sql_set_info.append(
6713                                    f"""
6714                                        concat(
6715                                            'PZScore=',
6716                                            PZScore{pzfields_sep}{profile}
6717                                        )
6718                                    """
6719                                )
6720
6721                        # PZFlag
6722                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6723                            sql_set_info.append(
6724                                f"""
6725                                    concat(
6726                                        'PZFlag{pzfields_sep}{profile}=',
6727                                        CASE 
6728                                            WHEN PZFlag{pzfields_sep}{profile}==1
6729                                            THEN 'PASS'
6730                                            WHEN PZFlag{pzfields_sep}{profile}==0
6731                                            THEN 'FILTERED'
6732                                        END
6733                                    ) 
6734                                """
6735                            )
6736                            if (
6737                                profile == default_profile
6738                                and "PZFlag" in list_of_pzfields
6739                            ):
6740                                sql_set_info.append(
6741                                    f"""
6742                                        concat(
6743                                            'PZFlag=',
6744                                            CASE 
6745                                                WHEN PZFlag{pzfields_sep}{profile}==1
6746                                                THEN 'PASS'
6747                                                WHEN PZFlag{pzfields_sep}{profile}==0
6748                                                THEN 'FILTERED'
6749                                            END
6750                                        )
6751                                    """
6752                                )
6753
6754                        # PZComment
6755                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6756                            sql_set_info.append(
6757                                f"""
6758                                    CASE
6759                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6760                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6761                                        ELSE ''
6762                                    END
6763                                """
6764                            )
6765                            if (
6766                                profile == default_profile
6767                                and "PZComment" in list_of_pzfields
6768                            ):
6769                                sql_set_info.append(
6770                                    f"""
6771                                        CASE
6772                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6773                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6774                                            ELSE ''
6775                                        END
6776                                    """
6777                                )
6778
6779                        # PZInfos
6780                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6781                            sql_set_info.append(
6782                                f"""
6783                                    CASE
6784                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6785                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6786                                        ELSE ''
6787                                    END
6788                                """
6789                            )
6790                            if (
6791                                profile == default_profile
6792                                and "PZInfos" in list_of_pzfields
6793                            ):
6794                                sql_set_info.append(
6795                                    f"""
6796                                        CASE
6797                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6798                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6799                                            ELSE ''
6800                                        END
6801                                    """
6802                                )
6803
6804                        # Merge PZfields
6805                        sql_set_info_option = ""
6806                        sql_set_sep = ""
6807                        for sql_set in sql_set_info:
6808                            if sql_set_sep:
6809                                sql_set_info_option += f"""
6810                                    , concat('{sql_set_sep}', {sql_set})
6811                                """
6812                            else:
6813                                sql_set_info_option += f"""
6814                                    , {sql_set}
6815                                """
6816                            sql_set_sep = ";"
6817
6818                        sql_queries = []
6819                        for annotation in prioritizations_config[profile]:
6820
6821                            # Check if annotation field is present
6822                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6823                                log.debug(f"Annotation '{annotation}' not in data")
6824                                continue
6825                            else:
6826                                log.debug(f"Annotation '{annotation}' in data")
6827
6828                            # For each criterions
6829                            for criterion in prioritizations_config[profile][
6830                                annotation
6831                            ]:
6832                                criterion_type = criterion["type"]
6833                                criterion_value = criterion["value"]
6834                                criterion_score = criterion.get("score", 0)
6835                                criterion_flag = criterion.get("flag", "PASS")
6836                                criterion_flag_bool = criterion_flag == "PASS"
6837                                criterion_comment = (
6838                                    ", ".join(criterion.get("comment", []))
6839                                    .replace("'", "''")
6840                                    .replace(";", ",")
6841                                    .replace("\t", " ")
6842                                )
6843                                criterion_infos = (
6844                                    str(criterion)
6845                                    .replace("'", "''")
6846                                    .replace(";", ",")
6847                                    .replace("\t", " ")
6848                                )
6849
6850                                sql_set = []
6851                                sql_set_info = []
6852
6853                                # PZ fields set
6854                                if (
6855                                    f"PZScore{pzfields_sep}{profile}"
6856                                    in list_of_pzfields
6857                                ):
6858                                    if prioritization_score_mode == "HOWARD":
6859                                        sql_set.append(
6860                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6861                                        )
6862                                    elif prioritization_score_mode == "VaRank":
6863                                        sql_set.append(
6864                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6865                                        )
6866                                    else:
6867                                        sql_set.append(
6868                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6869                                        )
6870                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6871                                    sql_set.append(
6872                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6873                                    )
6874                                if (
6875                                    f"PZComment{pzfields_sep}{profile}"
6876                                    in list_of_pzfields
6877                                ):
6878                                    sql_set.append(
6879                                        f"""
6880                                            PZComment{pzfields_sep}{profile} = 
6881                                                concat(
6882                                                    PZComment{pzfields_sep}{profile},
6883                                                    CASE 
6884                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6885                                                        THEN ', '
6886                                                        ELSE ''
6887                                                    END,
6888                                                    '{criterion_comment}'
6889                                                )
6890                                        """
6891                                    )
6892                                if (
6893                                    f"PZInfos{pzfields_sep}{profile}"
6894                                    in list_of_pzfields
6895                                ):
6896                                    sql_set.append(
6897                                        f"""
6898                                            PZInfos{pzfields_sep}{profile} = 
6899                                                concat(
6900                                                    PZInfos{pzfields_sep}{profile},
6901                                                    '{criterion_infos}'
6902                                                )
6903                                        """
6904                                    )
6905                                sql_set_option = ",".join(sql_set)
6906
6907                                # Criterion and comparison
6908                                try:
6909                                    float(criterion_value)
6910                                    sql_update = f"""
6911                                        UPDATE {table_variants}
6912                                        SET {sql_set_option}
6913                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
6914                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
6915                                        """
6916                                except:
6917                                    contains_option = ""
6918                                    if criterion_type == "contains":
6919                                        contains_option = ".*"
6920                                    sql_update = f"""
6921                                        UPDATE {table_variants}
6922                                        SET {sql_set_option}
6923                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
6924                                        """
6925                                sql_queries.append(sql_update)
6926
6927                        # PZTags
6928                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
6929
                            # Create PZTags value
6931                            pztags_value = ""
6932                            pztags_sep_default = "|"
6933                            pztags_sep = ""
6934                            for pzfield in pzfields:
6935                                if pzfield not in ["PZTags"]:
6936                                    if (
6937                                        f"{pzfield}{pzfields_sep}{profile}"
6938                                        in list_of_pzfields
6939                                    ):
6940                                        if pzfield in ["PZFlag"]:
6941                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
6942                                                CASE WHEN PZFlag{pzfields_sep}{profile}
6943                                                    THEN 'PASS'
6944                                                    ELSE 'FILTERED'
6945                                                END, '"""
6946                                        else:
6947                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
6948                                        pztags_sep = pztags_sep_default
6949
                            # Add query updating PZTags for this profile
6951                            sql_update_pztags = f"""
6952                                UPDATE {table_variants}
6953                                SET INFO = concat(
6954                                        INFO,
6955                                        CASE WHEN INFO NOT in ('','.')
6956                                                THEN ';'
6957                                                ELSE ''
6958                                        END,
6959                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
6960                                    )
6961                                """
6962                            sql_queries.append(sql_update_pztags)
6963
                            # Add query updating PZTags for the default profile
6965                            if profile == default_profile:
6966                                sql_update_pztags_default = f"""
6967                                UPDATE {table_variants}
6968                                SET INFO = concat(
6969                                        INFO,
6970                                        ';',
6971                                        'PZTags={pztags_value}'
6972                                    )
6973                                """
6974                                sql_queries.append(sql_update_pztags_default)
6975
6976                        log.info(f"""Profile '{profile}' - Prioritization... """)
6977
6978                        if sql_queries:
6979
6980                            for sql_query in sql_queries:
6981                                log.debug(
6982                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
6983                                )
6984                                self.conn.execute(sql_query)
6985
6986                        log.info(f"""Profile '{profile}' - Update... """)
6987                        sql_query_update = f"""
6988                            UPDATE {table_variants}
6989                            SET INFO =  
6990                                concat(
6991                                    CASE
6992                                        WHEN INFO NOT IN ('','.')
6993                                        THEN concat(INFO, ';')
6994                                        ELSE ''
6995                                    END
6996                                    {sql_set_info_option}
6997                                )
6998                        """
6999                        self.conn.execute(sql_query_update)
7000
7001        else:
7002
7003            log.warning(f"No profiles in parameters")
7004
7005        # Remove added columns
7006        for added_column in added_columns:
7007            self.drop_column(column=added_column)
7008
7009        # Explode INFOS fields into table fields
7010        if self.get_explode_infos():
7011            self.explode_infos(
7012                prefix=self.get_explode_infos_prefix(),
7013                fields=self.get_explode_infos_fields(),
7014                force=True,
7015            )
7016
7017        return
7018
7019    ###
7020    # HGVS
7021    ###
7022
7023    def annotation_hgvs(self, threads: int = None) -> None:
7024        """
7025        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
7026        coordinates and alleles.
7027
7028        :param threads: The `threads` parameter is an optional integer that specifies the number of
7029        threads to use for parallel processing. If no value is provided, it will default to the number
7030        of threads obtained from the `get_threads()` method
7031        :type threads: int
7032        """
7033
7034        # Function for each partition of the Dask Dataframe
7035        def partition_function(partition):
7036            """
7037            The function `partition_function` applies the `annotation_hgvs_partition` function to
7038            each row of a DataFrame called `partition`.
7039
7040            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
7041            to be processed
7042            :return: the result of applying the "annotation_hgvs_partition" function to each row of
7043            the "partition" dataframe along the axis 1.
7044            """
7045            return partition.apply(annotation_hgvs_partition, axis=1)
7046
        def annotation_hgvs_partition(row) -> str:
            """
            Build the HGVS annotation string for a single variant row.

            Looks up the transcripts overlapping the variant position via the
            enclosing scope's polars SQL context (``polars_conn``, which
            resolves ``refseq_df``/``refseqlink_df`` by global name), then
            formats one HGVS name per transcript with ``format_hgvs_name``.

            Closure variables read (all defined in ``annotation_hgvs``):
            ``polars_conn``, ``transcripts``, ``genome``, ``use_exon``,
            ``use_gene``, ``use_protein``, ``add_protein``, ``full_format``,
            ``use_version`` and ``codon_type``.

            :param row: a dict-like record (e.g. a pandas Series) providing
                the "CHROM", "POS", "REF" and "ALT" keys of one variant
            :return: comma-separated HGVS names for the variant; empty string
                when no transcript overlaps the position
            """

            # Variant coordinates and alleles
            # NOTE(review): `chr` shadows the builtin within this function
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (refseq_df is resolved by name through the polars SQL context)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model matching the transcript name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number (only computed when requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession linked to the transcript, when needed.
                # NOTE(review): refseqlink_df is only created when a
                # refSeqLink file was found — confirm that protein options
                # are never enabled without one, else this raises NameError.
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name for this transcript
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When add_protein is set (and the protein form is not already
                # included), append the protein-level name as an extra entry
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Join all HGVS names into one comma-separated string
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full
7142
7143        # Polars connexion
7144        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7145
7146        # Config
7147        config = self.get_config()
7148
7149        # Databases
7150        # Genome
7151        databases_genomes_folders = (
7152            config.get("folders", {})
7153            .get("databases", {})
7154            .get("genomes", DEFAULT_GENOME_FOLDER)
7155        )
7156        databases_genome = (
7157            config.get("folders", {}).get("databases", {}).get("genomes", "")
7158        )
7159        # refseq database folder
7160        databases_refseq_folders = (
7161            config.get("folders", {})
7162            .get("databases", {})
7163            .get("refseq", DEFAULT_REFSEQ_FOLDER)
7164        )
7165        # refseq
7166        databases_refseq = config.get("databases", {}).get("refSeq", None)
7167        # refSeqLink
7168        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
7169
7170        # Param
7171        param = self.get_param()
7172
7173        # Quick HGVS
7174        if "hgvs_options" in param and param.get("hgvs_options", ""):
7175            log.info(f"Quick HGVS Annotation:")
7176            if not param.get("hgvs", None):
7177                param["hgvs"] = {}
7178            for option in param.get("hgvs_options", "").split(","):
7179                option_var_val = option.split("=")
7180                option_var = option_var_val[0]
7181                if len(option_var_val) > 1:
7182                    option_val = option_var_val[1]
7183                else:
7184                    option_val = "True"
7185                if option_val.upper() in ["TRUE"]:
7186                    option_val = True
7187                elif option_val.upper() in ["FALSE"]:
7188                    option_val = False
7189                log.info(f"   {option_var}={option_val}")
7190                param["hgvs"][option_var] = option_val
7191
7192        # Check if HGVS annotation enabled
7193        if "hgvs" in param:
7194            log.info(f"HGVS Annotation... ")
7195            for hgvs_option in param.get("hgvs", {}):
7196                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
7197        else:
7198            return
7199
7200        # HGVS Param
7201        param_hgvs = param.get("hgvs", {})
7202        use_exon = param_hgvs.get("use_exon", False)
7203        use_gene = param_hgvs.get("use_gene", False)
7204        use_protein = param_hgvs.get("use_protein", False)
7205        add_protein = param_hgvs.get("add_protein", False)
7206        full_format = param_hgvs.get("full_format", False)
7207        use_version = param_hgvs.get("use_version", False)
7208        codon_type = param_hgvs.get("codon_type", "3")
7209
        # refSeq / refSeqLink
7211        databases_refseq = param_hgvs.get("refseq", databases_refseq)
7212        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
7213
7214        # Assembly
7215        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
7216
7217        # Genome
7218        genome_file = None
7219        if find_genome(databases_genome):
7220            genome_file = find_genome(databases_genome)
7221        else:
7222            genome_file = find_genome(
7223                genome_path=databases_genomes_folders, assembly=assembly
7224            )
7225        log.debug("Genome: " + str(genome_file))
7226
        # refSeq
7228        refseq_file = find_file_prefix(
7229            input_file=databases_refseq,
7230            prefix="ncbiRefSeq",
7231            folder=databases_refseq_folders,
7232            assembly=assembly,
7233        )
7234        log.debug("refSeq: " + str(refseq_file))
7235
7236        # refSeqLink
7237        refseqlink_file = find_file_prefix(
7238            input_file=databases_refseqlink,
7239            prefix="ncbiRefSeqLink",
7240            folder=databases_refseq_folders,
7241            assembly=assembly,
7242        )
7243        log.debug("refSeqLink: " + str(refseqlink_file))
7244
7245        # Threads
7246        if not threads:
7247            threads = self.get_threads()
7248        log.debug("Threads: " + str(threads))
7249
7250        # Variables
7251        table_variants = self.get_table_variants(clause="update")
7252
7253        # Get variants SNV and InDel only
7254        query_variants = f"""
7255            SELECT "#CHROM" AS CHROM, POS, REF, ALT
7256            FROM {table_variants}
7257            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
7258            """
7259        df_variants = self.get_query_to_df(query_variants)
7260
7261        # Added columns
7262        added_columns = []
7263
7264        # Add hgvs column in variants table
7265        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
7266        added_column = self.add_column(
7267            table_variants, hgvs_column_name, "STRING", default_value=None
7268        )
7269        added_columns.append(added_column)
7270
7271        log.debug(f"refSeq loading...")
7272        # refSeq in duckDB
7273        refseq_table = get_refseq_table(
7274            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
7275        )
7276        # Loading all refSeq in Dataframe
7277        refseq_query = f"""
7278            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
7279            FROM {refseq_table}
7280            JOIN df_variants ON (
7281                {refseq_table}.chrom = df_variants.CHROM
7282                AND {refseq_table}.txStart<=df_variants.POS
7283                AND {refseq_table}.txEnd>=df_variants.POS
7284            )
7285        """
7286        refseq_df = self.conn.query(refseq_query).pl()
7287
7288        if refseqlink_file:
7289            log.debug(f"refSeqLink loading...")
7290            # refSeqLink in duckDB
7291            refseqlink_table = get_refseq_table(
7292                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
7293            )
7294            # Loading all refSeqLink in Dataframe
7295            protacc_column = "protAcc_with_ver"
7296            mrnaacc_column = "mrnaAcc_with_ver"
7297            refseqlink_query = f"""
7298                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
7299                FROM {refseqlink_table} 
7300                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
7301                WHERE protAcc_without_ver IS NOT NULL
7302            """
7303            # Polars Dataframe
7304            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
7305
7306        # Read RefSeq transcripts into a python dict/model.
7307        log.debug(f"Transcripts loading...")
7308        with tempfile.TemporaryDirectory() as tmpdir:
7309            transcripts_query = f"""
7310                COPY (
7311                    SELECT {refseq_table}.*
7312                    FROM {refseq_table}
7313                    JOIN df_variants ON (
7314                        {refseq_table}.chrom=df_variants.CHROM
7315                        AND {refseq_table}.txStart<=df_variants.POS
7316                        AND {refseq_table}.txEnd>=df_variants.POS
7317                    )
7318                )
7319                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
7320            """
7321            self.conn.query(transcripts_query)
7322            with open(f"{tmpdir}/transcript.tsv") as infile:
7323                transcripts = read_transcripts(infile)
7324
7325        # Polars connexion
7326        polars_conn = pl.SQLContext(register_globals=True, eager=True)
7327
7328        log.debug("Genome loading...")
7329        # Read genome sequence using pyfaidx.
7330        genome = Fasta(genome_file)
7331
7332        log.debug("Start annotation HGVS...")
7333
7334        # Create
7335        # a Dask Dataframe from Pandas dataframe with partition as number of threads
7336        ddf = dd.from_pandas(df_variants, npartitions=threads)
7337
7338        # Use dask.dataframe.apply() to apply function on each partition
7339        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
7340
7341        # Convert Dask DataFrame to Pandas Dataframe
7342        df = ddf.compute()
7343
7344        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
7345        with tempfile.TemporaryDirectory() as tmpdir:
7346            df_parquet = os.path.join(tmpdir, "df.parquet")
7347            df.to_parquet(df_parquet)
7348
7349            # Update hgvs column
7350            update_variant_query = f"""
7351                UPDATE {table_variants}
7352                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
7353                FROM read_parquet('{df_parquet}') as df
7354                WHERE variants."#CHROM" = df.CHROM
7355                AND variants.POS = df.POS
7356                AND variants.REF = df.REF
7357                AND variants.ALT = df.ALT
7358                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
7359                """
7360            self.execute_query(update_variant_query)
7361
7362        # Update INFO column
7363        sql_query_update = f"""
7364            UPDATE {table_variants}
7365            SET INFO = 
7366                concat(
7367                    CASE 
7368                        WHEN INFO NOT IN ('','.')
7369                        THEN concat(INFO, ';')
7370                        ELSE ''
7371                    END,
7372                    'hgvs=',
7373                    {hgvs_column_name}
7374                )
7375            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
7376            """
7377        self.execute_query(sql_query_update)
7378
7379        # Add header
7380        HGVS_INFOS = {
7381            "hgvs": {
7382                "ID": "hgvs",
7383                "Number": ".",
7384                "Type": "String",
7385                "Description": f"HGVS annotatation with HOWARD",
7386            }
7387        }
7388
7389        for field in HGVS_INFOS:
7390            field_ID = HGVS_INFOS[field]["ID"]
7391            field_description = HGVS_INFOS[field]["Description"]
7392            self.get_header().infos[field_ID] = vcf.parser._Info(
7393                field_ID,
7394                HGVS_INFOS[field]["Number"],
7395                HGVS_INFOS[field]["Type"],
7396                field_description,
7397                "unknown",
7398                "unknown",
7399                code_type_map[HGVS_INFOS[field]["Type"]],
7400            )
7401
7402        # Remove added columns
7403        for added_column in added_columns:
7404            self.drop_column(column=added_column)
7405
7406    ###
7407    # Calculation
7408    ###
7409
7410    def get_operations_help(
7411        self, operations_config_dict: dict = {}, operations_config_file: str = None
7412    ) -> list:
7413
7414        # Init
7415        operations_help = []
7416
7417        # operations
7418        operations = self.get_config_json(
7419            name="calculations",
7420            config_dict=operations_config_dict,
7421            config_file=operations_config_file,
7422        )
7423        for op in operations:
7424            op_name = operations[op].get("name", op).upper()
7425            op_description = operations[op].get("description", op_name)
7426            op_available = operations[op].get("available", False)
7427            if op_available:
7428                operations_help.append(f"   {op_name}: {op_description}")
7429
7430        # Sort operations
7431        operations_help.sort()
7432
7433        # insert header
7434        operations_help.insert(0, "Available calculation operations:")
7435
7436        # Return
7437        return operations_help
7438
7439    def calculation(
7440        self,
7441        operations: dict = {},
7442        operations_config_dict: dict = {},
7443        operations_config_file: str = None,
7444    ) -> None:
7445        """
7446        It takes a list of operations, and for each operation, it checks if it's a python or sql
7447        operation, and then calls the appropriate function
7448
7449        param json example:
7450            "calculation": {
7451                "NOMEN": {
7452                    "options": {
7453                        "hgvs_field": "hgvs"
7454                    },
7455                "middle" : null
7456            }
7457        """
7458
7459        # Param
7460        param = self.get_param()
7461
7462        # operations config
7463        operations_config = self.get_config_json(
7464            name="calculations",
7465            config_dict=operations_config_dict,
7466            config_file=operations_config_file,
7467        )
7468
7469        # Upper keys
7470        operations_config = {k.upper(): v for k, v in operations_config.items()}
7471
7472        # Calculations
7473
7474        # Operations from param
7475        operations = param.get("calculation", {}).get("calculations", operations)
7476
7477        # Quick calculation - add
7478        if param.get("calculations", None):
7479            calculations_list = [
7480                value for value in param.get("calculations", "").split(",")
7481            ]
7482            log.info(f"Quick Calculations:")
7483            for calculation_key in calculations_list:
7484                log.info(f"   {calculation_key}")
7485            for calculation_operation in calculations_list:
7486                if calculation_operation.upper() not in operations:
7487                    operations[calculation_operation.upper()] = {}
7488                    add_value_into_dict(
7489                        dict_tree=param,
7490                        sections=[
7491                            "calculation",
7492                            "calculations",
7493                            calculation_operation.upper(),
7494                        ],
7495                        value={},
7496                    )
7497
7498        # Operations for calculation
7499        if not operations:
7500            operations = param.get("calculation", {}).get("calculations", {})
7501
7502        if operations:
7503            log.info(f"Calculations...")
7504
7505        # For each operations
7506        for operation_name in operations:
7507            operation_name = operation_name.upper()
7508            if operation_name not in [""]:
7509                if operation_name in operations_config:
7510                    log.info(f"Calculation '{operation_name}'")
7511                    operation = operations_config[operation_name]
7512                    operation_type = operation.get("type", "sql")
7513                    if operation_type == "python":
7514                        self.calculation_process_function(
7515                            operation=operation, operation_name=operation_name
7516                        )
7517                    elif operation_type == "sql":
7518                        self.calculation_process_sql(
7519                            operation=operation, operation_name=operation_name
7520                        )
7521                    else:
7522                        log.error(
7523                            f"Operations config: Type '{operation_type}' NOT available"
7524                        )
7525                        raise ValueError(
7526                            f"Operations config: Type '{operation_type}' NOT available"
7527                        )
7528                else:
7529                    log.error(
7530                        f"Operations config: Calculation '{operation_name}' NOT available"
7531                    )
7532                    raise ValueError(
7533                        f"Operations config: Calculation '{operation_name}' NOT available"
7534                    )
7535
7536        # Explode INFOS fields into table fields
7537        if self.get_explode_infos():
7538            self.explode_infos(
7539                prefix=self.get_explode_infos_prefix(),
7540                fields=self.get_explode_infos_fields(),
7541                force=True,
7542            )
7543
7544    def calculation_process_sql(
7545        self, operation: dict, operation_name: str = "unknown"
7546    ) -> None:
7547        """
7548        The `calculation_process_sql` function takes in a mathematical operation as a string and
7549        performs the operation, updating the specified table with the result.
7550
7551        :param operation: The `operation` parameter is a dictionary that contains information about the
7552        mathematical operation to be performed. It includes the following keys:
7553        :type operation: dict
7554        :param operation_name: The `operation_name` parameter is a string that represents the name of
7555        the mathematical operation being performed. It is used for logging and error handling purposes,
7556        defaults to unknown
7557        :type operation_name: str (optional)
7558        """
7559
7560        # table variants
7561        table_variants = self.get_table_variants(clause="alter")
7562
7563        # Operation infos
7564        operation_name = operation.get("name", "unknown")
7565        log.debug(f"process sql {operation_name}")
7566        output_column_name = operation.get("output_column_name", operation_name)
7567        output_column_type = operation.get("output_column_type", "String")
7568        prefix = operation.get("explode_infos_prefix", "")
7569        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7570        output_column_description = operation.get(
7571            "output_column_description", f"{operation_name} operation"
7572        )
7573        operation_query = operation.get("operation_query", None)
7574        if isinstance(operation_query, list):
7575            operation_query = " ".join(operation_query)
7576        operation_info_fields = operation.get("info_fields", [])
7577        operation_info_fields_check = operation.get("info_fields_check", False)
7578        operation_info = operation.get("operation_info", True)
7579
7580        if operation_query:
7581
7582            # Info fields check
7583            operation_info_fields_check_result = True
7584            if operation_info_fields_check:
7585                header_infos = self.get_header().infos
7586                for info_field in operation_info_fields:
7587                    operation_info_fields_check_result = (
7588                        operation_info_fields_check_result
7589                        and info_field in header_infos
7590                    )
7591
7592            # If info fields available
7593            if operation_info_fields_check_result:
7594
7595                # Added_columns
7596                added_columns = []
7597
7598                # Create VCF header field
7599                vcf_reader = self.get_header()
7600                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7601                    output_column_name,
7602                    ".",
7603                    output_column_type,
7604                    output_column_description,
7605                    "howard calculation",
7606                    "0",
7607                    self.code_type_map.get(output_column_type),
7608                )
7609
7610                # Explode infos if needed
7611                log.debug(f"calculation_process_sql prefix {prefix}")
7612                added_columns += self.explode_infos(
7613                    prefix=prefix,
7614                    fields=[output_column_name] + operation_info_fields,
7615                    force=True,
7616                )
7617
7618                # Create column
7619                added_column = self.add_column(
7620                    table_name=table_variants,
7621                    column_name=prefix + output_column_name,
7622                    column_type=output_column_type_sql,
7623                    default_value="null",
7624                )
7625                added_columns.append(added_column)
7626
7627                # Operation calculation
7628                try:
7629
7630                    # Query to update calculation column
7631                    sql_update = f"""
7632                        UPDATE {table_variants}
7633                        SET "{prefix}{output_column_name}" = ({operation_query})
7634                    """
7635                    self.conn.execute(sql_update)
7636
7637                    # Add to INFO
7638                    if operation_info:
7639                        sql_update_info = f"""
7640                            UPDATE {table_variants}
7641                            SET "INFO" =
7642                                concat(
7643                                    CASE
7644                                        WHEN "INFO" IS NOT NULL
7645                                        THEN concat("INFO", ';')
7646                                        ELSE ''
7647                                    END,
7648                                    '{output_column_name}=',
7649                                    "{prefix}{output_column_name}"
7650                                )
7651                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7652                        """
7653                        self.conn.execute(sql_update_info)
7654
7655                except:
7656                    log.error(
7657                        f"Operations config: Calculation '{operation_name}' query failed"
7658                    )
7659                    raise ValueError(
7660                        f"Operations config: Calculation '{operation_name}' query failed"
7661                    )
7662
7663                # Remove added columns
7664                for added_column in added_columns:
7665                    log.debug(f"added_column: {added_column}")
7666                    self.drop_column(column=added_column)
7667
7668            else:
7669                log.error(
7670                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7671                )
7672                raise ValueError(
7673                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7674                )
7675
7676        else:
7677            log.error(
7678                f"Operations config: Calculation '{operation_name}' query NOT defined"
7679            )
7680            raise ValueError(
7681                f"Operations config: Calculation '{operation_name}' query NOT defined"
7682            )
7683
7684    def calculation_process_function(
7685        self, operation: dict, operation_name: str = "unknown"
7686    ) -> None:
7687        """
7688        The `calculation_process_function` takes in an operation dictionary and performs the specified
7689        function with the given parameters.
7690
7691        :param operation: The `operation` parameter is a dictionary that contains information about the
7692        operation to be performed. It has the following keys:
7693        :type operation: dict
7694        :param operation_name: The `operation_name` parameter is a string that represents the name of
7695        the operation being performed. It is used for logging purposes, defaults to unknown
7696        :type operation_name: str (optional)
7697        """
7698
7699        operation_name = operation["name"]
7700        log.debug(f"process sql {operation_name}")
7701        function_name = operation["function_name"]
7702        function_params = operation["function_params"]
7703        getattr(self, function_name)(*function_params)
7704
7705    def calculation_variant_id(self) -> None:
7706        """
7707        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7708        updates the INFO field of a variants table with the variant ID.
7709        """
7710
7711        # variant_id annotation field
7712        variant_id_tag = self.get_variant_id_column()
7713        added_columns = [variant_id_tag]
7714
7715        # variant_id hgvs tags"
7716        vcf_infos_tags = {
7717            variant_id_tag: "howard variant ID annotation",
7718        }
7719
7720        # Variants table
7721        table_variants = self.get_table_variants()
7722
7723        # Header
7724        vcf_reader = self.get_header()
7725
7726        # Add variant_id to header
7727        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7728            variant_id_tag,
7729            ".",
7730            "String",
7731            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7732            "howard calculation",
7733            "0",
7734            self.code_type_map.get("String"),
7735        )
7736
7737        # Update
7738        sql_update = f"""
7739            UPDATE {table_variants}
7740            SET "INFO" = 
7741                concat(
7742                    CASE
7743                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7744                        THEN ''
7745                        ELSE concat("INFO", ';')
7746                    END,
7747                    '{variant_id_tag}=',
7748                    "{variant_id_tag}"
7749                )
7750        """
7751        self.conn.execute(sql_update)
7752
7753        # Remove added columns
7754        for added_column in added_columns:
7755            self.drop_column(column=added_column)
7756
7757    def calculation_extract_snpeff_hgvs(self) -> None:
7758        """
7759        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7760        annotation field in a VCF file and adds them as a new column in the variants table.
7761        """
7762
7763        # SnpEff annotation field
7764        snpeff_ann = "ANN"
7765
7766        # SnpEff annotation field
7767        snpeff_hgvs = "snpeff_hgvs"
7768
7769        # Snpeff hgvs tags
7770        vcf_infos_tags = {
7771            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7772        }
7773
7774        # Prefix
7775        prefix = self.get_explode_infos_prefix()
7776        if prefix:
7777            prefix = "INFO/"
7778
7779        # snpEff fields
7780        speff_ann_infos = prefix + snpeff_ann
7781        speff_hgvs_infos = prefix + snpeff_hgvs
7782
7783        # Variants table
7784        table_variants = self.get_table_variants()
7785
7786        # Header
7787        vcf_reader = self.get_header()
7788
7789        # Add columns
7790        added_columns = []
7791
7792        # Explode HGVS field in column
7793        added_columns += self.explode_infos(fields=[snpeff_ann])
7794
7795        if "ANN" in vcf_reader.infos:
7796
7797            log.debug(vcf_reader.infos["ANN"])
7798
7799            # Create variant id
7800            variant_id_column = self.get_variant_id_column()
7801            added_columns += [variant_id_column]
7802
7803            # Create dataframe
7804            dataframe_snpeff_hgvs = self.get_query_to_df(
7805                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7806            )
7807
7808            # Create main NOMEN column
7809            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7810                speff_ann_infos
7811            ].apply(lambda x: extract_snpeff_hgvs(str(x)))
7812
7813            # Add snpeff_hgvs to header
7814            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7815                snpeff_hgvs,
7816                ".",
7817                "String",
7818                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7819                "howard calculation",
7820                "0",
7821                self.code_type_map.get("String"),
7822            )
7823
7824            # Update
7825            sql_update = f"""
7826                UPDATE variants
7827                SET "INFO" = 
7828                    concat(
7829                        CASE
7830                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7831                            THEN ''
7832                            ELSE concat("INFO", ';')
7833                        END,
7834                        CASE 
7835                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7836                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7837                            THEN concat(
7838                                    '{snpeff_hgvs}=',
7839                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7840                                )
7841                            ELSE ''
7842                        END
7843                    )
7844                FROM dataframe_snpeff_hgvs
7845                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7846
7847            """
7848            self.conn.execute(sql_update)
7849
7850            # Delete dataframe
7851            del dataframe_snpeff_hgvs
7852            gc.collect()
7853
7854        else:
7855
7856            log.warning(
7857                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7858            )
7859
7860        # Remove added columns
7861        for added_column in added_columns:
7862            self.drop_column(column=added_column)
7863
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        The HGVS annotation field (param `calculation.calculations.NOMEN.options.hgvs_field`,
        default "hgvs") is exploded into a column, parsed with `find_nomen` (optionally
        guided by a preferred-transcripts file), and the resulting NOMEN sub-fields are
        appended to the INFO column of the variants table.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the intermediate dataframe column holding the parsed NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used when INFO fields are exploded into columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field name from param (default "hgvs")
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get preferred transcripts file from param (optional)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the file is the ordered list of preferred transcripts
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added temporarily, dropped at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: columns available for extraction
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe of variant keys plus the exploded HGVS column
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Parse each HGVS string into a NOMEN dict
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (apply runs inside this iteration, so the late-bound
                # nomen_field in the lambda is safe here)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Each non-empty sub-field contributes ';FIELD=value' to INFO
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO joining on the variant key columns
            # (the dataframe is referenced by its Python variable name in the query)
            # NOTE(review): when INFO is NULL/empty the appended fields still start
            # with ';', so INFO may end up with a leading semicolon — confirm
            # whether downstream parsing tolerates this
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
8006
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Does nothing unless the file has a FORMAT column and at least one sample.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used when INFO fields are exploded into columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column that will hold the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples: columns needed by findbypipeline()
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the per-variant pipeline/sample count row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO joining on the variant id
            # (the dataframe is referenced by its Python variable name in the query)
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()
8112
8113    def calculation_genotype_concordance(self) -> None:
8114        """
8115        The function `calculation_genotype_concordance` calculates the genotype concordance for
8116        multi-caller VCF files and updates the variant information in the database.
8117        """
8118
8119        # if FORMAT and samples
8120        if (
8121            "FORMAT" in self.get_header_columns_as_list()
8122            and self.get_header_sample_list()
8123        ):
8124
8125            # genotypeconcordance annotation field
8126            genotypeconcordance_tag = "genotypeconcordance"
8127
8128            # VCF infos tags
8129            vcf_infos_tags = {
8130                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8131            }
8132
8133            # Prefix
8134            prefix = self.get_explode_infos_prefix()
8135
8136            # Field
8137            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8138
8139            # Variants table
8140            table_variants = self.get_table_variants()
8141
8142            # Header
8143            vcf_reader = self.get_header()
8144
8145            # Create variant id
8146            variant_id_column = self.get_variant_id_column()
8147            added_columns = [variant_id_column]
8148
8149            # variant_id, FORMAT and samples
8150            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8151                self.get_header_sample_list()
8152            )
8153
8154            # Create dataframe
8155            dataframe_genotypeconcordance = self.get_query_to_df(
8156                f""" SELECT {samples_fields} FROM {table_variants} """
8157            )
8158
8159            # Create genotypeconcordance column
8160            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8161                dataframe_genotypeconcordance.apply(
8162                    lambda row: genotypeconcordance(
8163                        row, samples=self.get_header_sample_list()
8164                    ),
8165                    axis=1,
8166                )
8167            )
8168
8169            # Add genotypeconcordance to header
8170            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8171                genotypeconcordance_tag,
8172                ".",
8173                "String",
8174                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8175                "howard calculation",
8176                "0",
8177                self.code_type_map.get("String"),
8178            )
8179
8180            # Update
8181            sql_update = f"""
8182                UPDATE variants
8183                SET "INFO" = 
8184                    concat(
8185                        CASE
8186                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8187                            THEN ''
8188                            ELSE concat("INFO", ';')
8189                        END,
8190                        CASE
8191                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8192                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8193                            THEN concat(
8194                                    '{genotypeconcordance_tag}=',
8195                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8196                                )
8197                            ELSE ''
8198                        END
8199                    )
8200                FROM dataframe_genotypeconcordance
8201                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8202            """
8203            self.conn.execute(sql_update)
8204
8205            # Remove added columns
8206            for added_column in added_columns:
8207                self.drop_column(column=added_column)
8208
8209            # Delete dataframe
8210            del dataframe_genotypeconcordance
8211            gc.collect()
8212
8213    def calculation_barcode(self, tag: str = "barcode") -> None:
8214        """
8215        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8216        updates the INFO field in the file with the calculated barcode values.
8217        """
8218
8219        # if FORMAT and samples
8220        if (
8221            "FORMAT" in self.get_header_columns_as_list()
8222            and self.get_header_sample_list()
8223        ):
8224
8225            # barcode annotation field
8226            if not tag:
8227                tag = "barcode"
8228
8229            # VCF infos tags
8230            vcf_infos_tags = {
8231                tag: "barcode calculation (VaRank)",
8232            }
8233
8234            # Prefix
8235            prefix = self.get_explode_infos_prefix()
8236
8237            # Field
8238            barcode_infos = prefix + tag
8239
8240            # Variants table
8241            table_variants = self.get_table_variants()
8242
8243            # Header
8244            vcf_reader = self.get_header()
8245
8246            # Create variant id
8247            variant_id_column = self.get_variant_id_column()
8248            added_columns = [variant_id_column]
8249
8250            # variant_id, FORMAT and samples
8251            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8252                self.get_header_sample_list()
8253            )
8254
8255            # Create dataframe
8256            dataframe_barcode = self.get_query_to_df(
8257                f""" SELECT {samples_fields} FROM {table_variants} """
8258            )
8259
8260            # Create barcode column
8261            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8262                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8263            )
8264
8265            # Add barcode to header
8266            vcf_reader.infos[tag] = vcf.parser._Info(
8267                tag,
8268                ".",
8269                "String",
8270                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8271                "howard calculation",
8272                "0",
8273                self.code_type_map.get("String"),
8274            )
8275
8276            # Update
8277            sql_update = f"""
8278                UPDATE {table_variants}
8279                SET "INFO" = 
8280                    concat(
8281                        CASE
8282                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8283                            THEN ''
8284                            ELSE concat("INFO", ';')
8285                        END,
8286                        CASE
8287                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8288                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8289                            THEN concat(
8290                                    '{tag}=',
8291                                    dataframe_barcode."{barcode_infos}"
8292                                )
8293                            ELSE ''
8294                        END
8295                    )
8296                FROM dataframe_barcode
8297                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8298            """
8299            self.conn.execute(sql_update)
8300
8301            # Remove added columns
8302            for added_column in added_columns:
8303                self.drop_column(column=added_column)
8304
8305            # Delete dataframe
8306            del dataframe_barcode
8307            gc.collect()
8308
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the FORMAT and sample columns with the calculated family barcode values.

        The family members are read from the 'BARCODEFAMILY.family_pedigree' calculation
        parameter (a JSON file path, a JSON string, a comma-separated sample list, or a
        dict); when absent, all samples of the header are used. Two FORMAT fields are
        added per genotype: the barcode value (tag) and the list of samples it was
        computed from (tag + "S").

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column and at least one sample)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags: descriptions for the barcode tag and its samples tag
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree describing the family members
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED from whichever form it was provided in
            if ped:

                # Pedigree is a file (NOTE: `ped` is rebound from path to file handle
                # to parsed dict within this branch)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, then comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # each sample mapping to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of sample names from the pedigree values
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample from the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved family members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Name of the computed column in the working dataframe
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used to join the dataframe back to the table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe restricted to genotype columns
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields (value and samples list) to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per column: pedigree samples get the barcode value,
            # FORMAT gets the new field names, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Regex used to derive a placeholder genotype ('./.:.:...') from FORMAT
                # for samples whose genotype is './.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in one UPDATE, joining the dataframe on the variant id.
            # NOTE: the SQL references the pandas DataFrame by its Python variable name
            # (DuckDB replacement scan)
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_barcode
            gc.collect()
8498
8499    def calculation_trio(self) -> None:
8500        """
8501        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8502        information to the INFO field of each variant.
8503        """
8504
8505        # if FORMAT and samples
8506        if (
8507            "FORMAT" in self.get_header_columns_as_list()
8508            and self.get_header_sample_list()
8509        ):
8510
8511            # trio annotation field
8512            trio_tag = "trio"
8513
8514            # VCF infos tags
8515            vcf_infos_tags = {
8516                "trio": "trio calculation",
8517            }
8518
8519            # Param
8520            param = self.get_param()
8521
8522            # Prefix
8523            prefix = self.get_explode_infos_prefix()
8524
8525            # Trio param
8526            trio_ped = (
8527                param.get("calculation", {})
8528                .get("calculations", {})
8529                .get("TRIO", {})
8530                .get("trio_pedigree", None)
8531            )
8532
8533            # Load trio
8534            if trio_ped:
8535
8536                # Trio pedigree is a file
8537                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8538                    log.debug("TRIO pedigree is file")
8539                    with open(full_path(trio_ped)) as trio_ped:
8540                        trio_ped = json.load(trio_ped)
8541
8542                # Trio pedigree is a string
8543                elif isinstance(trio_ped, str):
8544                    log.debug("TRIO pedigree is str")
8545                    try:
8546                        trio_ped = json.loads(trio_ped)
8547                        log.debug("TRIO pedigree is json str")
8548                    except ValueError as e:
8549                        trio_samples = trio_ped.split(",")
8550                        if len(trio_samples) == 3:
8551                            trio_ped = {
8552                                "father": trio_samples[0],
8553                                "mother": trio_samples[1],
8554                                "child": trio_samples[2],
8555                            }
8556                            log.debug("TRIO pedigree is list str")
8557                        else:
8558                            msg_error = "TRIO pedigree not well formatted"
8559                            log.error(msg_error)
8560                            raise ValueError(msg_error)
8561
8562                # Trio pedigree is a dict
8563                elif isinstance(trio_ped, dict):
8564                    log.debug("TRIO pedigree is dict")
8565
8566                # Trio pedigree is not well formatted
8567                else:
8568                    msg_error = "TRIO pedigree not well formatted"
8569                    log.error(msg_error)
8570                    raise ValueError(msg_error)
8571
8572                # Construct trio list
8573                trio_samples = [
8574                    trio_ped.get("father", ""),
8575                    trio_ped.get("mother", ""),
8576                    trio_ped.get("child", ""),
8577                ]
8578
8579            else:
8580                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8581                samples_list = self.get_header_sample_list()
8582                if len(samples_list) >= 3:
8583                    trio_samples = self.get_header_sample_list()[0:3]
8584                    trio_ped = {
8585                        "father": trio_samples[0],
8586                        "mother": trio_samples[1],
8587                        "child": trio_samples[2],
8588                    }
8589                else:
8590                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8591                    log.error(msg_error)
8592                    raise ValueError(msg_error)
8593
8594            # Check trio pedigree
8595            if not trio_ped or len(trio_ped) != 3:
8596                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8597                log.error(msg_error)
8598                raise ValueError(msg_error)
8599
8600            # Log
8601            log.info(
8602                f"Calculation 'TRIO' - Samples: "
8603                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8604            )
8605
8606            # Field
8607            trio_infos = prefix + trio_tag
8608
8609            # Variants table
8610            table_variants = self.get_table_variants()
8611
8612            # Header
8613            vcf_reader = self.get_header()
8614
8615            # Create variant id
8616            variant_id_column = self.get_variant_id_column()
8617            added_columns = [variant_id_column]
8618
8619            # variant_id, FORMAT and samples
8620            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8621                self.get_header_sample_list()
8622            )
8623
8624            # Create dataframe
8625            dataframe_trio = self.get_query_to_df(
8626                f""" SELECT {samples_fields} FROM {table_variants} """
8627            )
8628
8629            # Create trio column
8630            dataframe_trio[trio_infos] = dataframe_trio.apply(
8631                lambda row: trio(row, samples=trio_samples), axis=1
8632            )
8633
8634            # Add trio to header
8635            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8636                trio_tag,
8637                ".",
8638                "String",
8639                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8640                "howard calculation",
8641                "0",
8642                self.code_type_map.get("String"),
8643            )
8644
8645            # Update
8646            sql_update = f"""
8647                UPDATE {table_variants}
8648                SET "INFO" = 
8649                    concat(
8650                        CASE
8651                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8652                            THEN ''
8653                            ELSE concat("INFO", ';')
8654                        END,
8655                        CASE
8656                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8657                             AND dataframe_trio."{trio_infos}" NOT NULL
8658                            THEN concat(
8659                                    '{trio_tag}=',
8660                                    dataframe_trio."{trio_infos}"
8661                                )
8662                            ELSE ''
8663                        END
8664                    )
8665                FROM dataframe_trio
8666                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8667            """
8668            self.conn.execute(sql_update)
8669
8670            # Remove added columns
8671            for added_column in added_columns:
8672                self.drop_column(column=added_column)
8673
8674            # Delete dataframe
8675            del dataframe_trio
8676            gc.collect()
8677
8678    def calculation_vaf_normalization(self) -> None:
8679        """
8680        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8681        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8682        :return: The function does not return anything.
8683        """
8684
8685        # if FORMAT and samples
8686        if (
8687            "FORMAT" in self.get_header_columns_as_list()
8688            and self.get_header_sample_list()
8689        ):
8690
8691            # vaf_normalization annotation field
8692            vaf_normalization_tag = "VAF"
8693
8694            # VCF infos tags
8695            vcf_infos_tags = {
8696                "VAF": "VAF Variant Frequency",
8697            }
8698
8699            # Prefix
8700            prefix = self.get_explode_infos_prefix()
8701
8702            # Variants table
8703            table_variants = self.get_table_variants()
8704
8705            # Header
8706            vcf_reader = self.get_header()
8707
8708            # Do not calculate if VAF already exists
8709            if "VAF" in vcf_reader.formats:
8710                log.debug("VAF already on genotypes")
8711                return
8712
8713            # Create variant id
8714            variant_id_column = self.get_variant_id_column()
8715            added_columns = [variant_id_column]
8716
8717            # variant_id, FORMAT and samples
8718            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8719                self.get_header_sample_list()
8720            )
8721
8722            # Create dataframe
8723            dataframe_vaf_normalization = self.get_query_to_df(
8724                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
8725            )
8726
8727            vaf_normalization_set = []
8728
8729            # for each sample vaf_normalization
8730            for sample in self.get_header_sample_list():
8731                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
8732                    lambda row: vaf_normalization(row, sample=sample), axis=1
8733                )
8734                vaf_normalization_set.append(
8735                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
8736                )
8737
8738            # Add VAF to FORMAT
8739            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
8740                "FORMAT"
8741            ].apply(lambda x: str(x) + ":VAF")
8742            vaf_normalization_set.append(
8743                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
8744            )
8745
8746            # Add vaf_normalization to header
8747            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
8748                id=vaf_normalization_tag,
8749                num="1",
8750                type="Float",
8751                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
8752                type_code=self.code_type_map.get("Float"),
8753            )
8754
8755            # Create fields to add in INFO
8756            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
8757
8758            # Update
8759            sql_update = f"""
8760                UPDATE {table_variants}
8761                SET {sql_vaf_normalization_set}
8762                FROM dataframe_vaf_normalization
8763                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
8764
8765            """
8766            self.conn.execute(sql_update)
8767
8768            # Remove added columns
8769            for added_column in added_columns:
8770                self.drop_column(column=added_column)
8771
8772            # Delete dataframe
8773            del dataframe_vaf_normalization
8774            gc.collect()
8775
8776    def calculation_genotype_stats(self, info: str = "VAF") -> None:
8777        """
8778        The `calculation_genotype_stats` function calculates genotype statistics for a given information
8779        field in a VCF file and updates the INFO column of the variants table with the calculated
8780        statistics.
8781
8782        :param info: The `info` parameter is a string that represents the type of information for which
8783        genotype statistics are calculated. It is used to generate various VCF info tags for the
8784        statistics, such as the number of occurrences, the list of values, the minimum value, the
8785        maximum value, the mean, the median, defaults to VAF
8786        :type info: str (optional)
8787        """
8788
8789        # if FORMAT and samples
8790        if (
8791            "FORMAT" in self.get_header_columns_as_list()
8792            and self.get_header_sample_list()
8793        ):
8794
8795            # vaf_stats annotation field
8796            vaf_stats_tag = info + "_stats"
8797
8798            # VCF infos tags
8799            vcf_infos_tags = {
8800                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
8801                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
8802                info + "_stats_min": f"genotype {info} Statistics - min {info}",
8803                info + "_stats_max": f"genotype {info} Statistics - max {info}",
8804                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
8805                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
8806                info
8807                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
8808            }
8809
8810            # Prefix
8811            prefix = self.get_explode_infos_prefix()
8812
8813            # Field
8814            vaf_stats_infos = prefix + vaf_stats_tag
8815
8816            # Variants table
8817            table_variants = self.get_table_variants()
8818
8819            # Header
8820            vcf_reader = self.get_header()
8821
8822            # Create variant id
8823            variant_id_column = self.get_variant_id_column()
8824            added_columns = [variant_id_column]
8825
8826            # variant_id, FORMAT and samples
8827            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8828                self.get_header_sample_list()
8829            )
8830
8831            # Create dataframe
8832            dataframe_vaf_stats = self.get_query_to_df(
8833                f""" SELECT {samples_fields} FROM {table_variants} """
8834            )
8835
8836            # Create vaf_stats column
8837            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
8838                lambda row: genotype_stats(
8839                    row, samples=self.get_header_sample_list(), info=info
8840                ),
8841                axis=1,
8842            )
8843
8844            # List of vcf tags
8845            sql_vaf_stats_fields = []
8846
8847            # Check all VAF stats infos
8848            for stat in vcf_infos_tags:
8849
8850                # Extract stats
8851                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
8852                    lambda x: dict(x).get(stat, "")
8853                )
8854
8855                # Add snpeff_hgvs to header
8856                vcf_reader.infos[stat] = vcf.parser._Info(
8857                    stat,
8858                    ".",
8859                    "String",
8860                    vcf_infos_tags.get(stat, "genotype statistics"),
8861                    "howard calculation",
8862                    "0",
8863                    self.code_type_map.get("String"),
8864                )
8865
8866                if len(sql_vaf_stats_fields):
8867                    sep = ";"
8868                else:
8869                    sep = ""
8870
8871                # Create fields to add in INFO
8872                sql_vaf_stats_fields.append(
8873                    f"""
8874                        CASE
8875                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
8876                            THEN concat(
8877                                    '{sep}{stat}=',
8878                                    dataframe_vaf_stats."{stat}"
8879                                )
8880                            ELSE ''
8881                        END
8882                    """
8883                )
8884
8885            # SQL set for update
8886            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
8887
8888            # Update
8889            sql_update = f"""
8890                UPDATE variants
8891                SET "INFO" = 
8892                    concat(
8893                        CASE
8894                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8895                            THEN ''
8896                            ELSE concat("INFO", ';')
8897                        END,
8898                        {sql_vaf_stats_fields_set}
8899                    )
8900                FROM dataframe_vaf_stats
8901                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
8902
8903            """
8904            self.conn.execute(sql_update)
8905
8906            # Remove added columns
8907            for added_column in added_columns:
8908                self.drop_column(column=added_column)
8909
8910            # Delete dataframe
8911            del dataframe_vaf_stats
8912            gc.collect()
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Load data
78        if load:
79            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connection and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
 81    def set_input(self, input: str = None) -> None:
 82        """
 83        The function takes a file name as input, splits the file name into a name and an extension, and
 84        then sets the input_name, input_extension, and input_format attributes of the class
 85
 86        :param input: The input file
 87        """
 88
 89        if input and not isinstance(input, str):
 90            try:
 91                self.input = input.name
 92            except:
 93                log.error(f"Input file '{input} in bad format")
 94                raise ValueError(f"Input file '{input} in bad format")
 95        else:
 96            self.input = input
 97
 98        # Input format
 99        if input:
100            input_name, input_extension = os.path.splitext(self.input)
101            self.input_name = input_name
102            self.input_extension = input_extension
103            self.input_format = self.input_extension.replace(".", "")

The function takes a file name as input, splits the file name into a name and an extension, and then sets the input_name, input_extension, and input_format attributes of the class

Parameters
  • input: The input file
def set_config(self, config: dict) -> None:
105    def set_config(self, config: dict) -> None:
106        """
107        This function takes in a config object and sets it as the config object for the class
108
109        :param config: The configuration object
110        """
111        self.config = config

This function takes in a config object and sets it as the config object for the class

Parameters
  • config: The configuration object
def set_param(self, param: dict) -> None:
113    def set_param(self, param: dict) -> None:
114        """
115        This function takes in a param object and sets it as the param object for the class
116
117        :param param: The parameters object
118        """
119        self.param = param

This function takes in a param object and sets it as the param object for the class

Parameters
  • param: The parameters object
def init_variables(self) -> None:
121    def init_variables(self) -> None:
122        """
123        This function initializes the variables that will be used in the rest of the class
124        """
125        self.prefix = "howard"
126        self.table_variants = "variants"
127        self.dataframe = None
128
129        self.comparison_map = {
130            "gt": ">",
131            "gte": ">=",
132            "lt": "<",
133            "lte": "<=",
134            "equals": "=",
135            "contains": "SIMILAR TO",
136        }
137
138        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
139
140        self.code_type_map_to_sql = {
141            "Integer": "INTEGER",
142            "String": "VARCHAR",
143            "Float": "FLOAT",
144            "Flag": "VARCHAR",
145        }
146
147        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
149    def get_indexing(self) -> bool:
150        """
151        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
152        returns False.
153        :return: The value of the indexing parameter.
154        """
155        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
157    def get_connexion_config(self) -> dict:
158        """
159        The function `get_connexion_config` returns a dictionary containing the configuration for a
160        connection, including the number of threads and memory limit.
161        :return: a dictionary containing the configuration options for the database connection (threads, memory limit, temporary directory, access mode).
162        """
163
164        # config
165        config = self.get_config()
166
167        # Connexion config
168        connexion_config = {}
169        threads = self.get_threads()
170
171        # Threads
172        if threads:
173            connexion_config["threads"] = threads
174
175        # Memory
176        # if config.get("memory", None):
177        #     connexion_config["memory_limit"] = config.get("memory")
178        if self.get_memory():
179            connexion_config["memory_limit"] = self.get_memory()
180
181        # Temporary directory
182        if config.get("tmp", None):
183            connexion_config["temp_directory"] = config.get("tmp")
184
185        # Access
186        if config.get("access", None):
187            access = config.get("access")
188            if access in ["RO"]:
189                access = "READ_ONLY"
190            elif access in ["RW"]:
191                access = "READ_WRITE"
192            connexion_db = self.get_connexion_db()
193            if connexion_db in ":memory:":
194                access = "READ_WRITE"
195            connexion_config["access_mode"] = access
196
197        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration options for the database connection (threads, memory limit, temporary directory, access mode).

def get_duckdb_settings(self) -> dict:
199    def get_duckdb_settings(self) -> dict:
200        """
201        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
202        string.
203        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
204        """
205
206        # config
207        config = self.get_config()
208
209        # duckdb settings
210        duckdb_settings_dict = {}
211        if config.get("duckdb_settings", None):
212            duckdb_settings = config.get("duckdb_settings")
213            duckdb_settings = full_path(duckdb_settings)
214            # duckdb setting is a file
215            if os.path.exists(duckdb_settings):
216                with open(duckdb_settings) as json_file:
217                    duckdb_settings_dict = yaml.safe_load(json_file)
218            # duckdb settings is a string
219            else:
220                duckdb_settings_dict = json.loads(duckdb_settings)
221
222        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
224    def set_connexion_db(self) -> str:
225        """
226        The function `set_connexion_db` returns the appropriate database connection string based on the
227        input format and connection type.
228        :return: the value of the variable `connexion_db`.
229        """
230
231        # Default connexion db
232        default_connexion_db = ":memory:"
233
234        # Find connexion db
235        if self.get_input_format() in ["db", "duckdb"]:
236            connexion_db = self.get_input()
237        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
238            connexion_db = default_connexion_db
239        elif self.get_connexion_type() in ["tmpfile"]:
240            tmp_name = tempfile.mkdtemp(
241                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
242            )
243            connexion_db = f"{tmp_name}/tmp.db"
244        elif self.get_connexion_type() != "":
245            connexion_db = self.get_connexion_type()
246        else:
247            connexion_db = default_connexion_db
248
249        # Set connexion db
250        self.connexion_db = connexion_db
251
252        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
254    def set_connexion(self, conn) -> None:
255        """
256        It creates a connection to the database
257
258        :param conn: The connection to the database. If not provided, a new connection to an in-memory
259        database is created
260        """
261
262        # Connexion db
263        connexion_db = self.set_connexion_db()
264
265        # Connexion config
266        connexion_config = self.get_connexion_config()
267
268        # Connexion format
269        connexion_format = self.get_config().get("connexion_format", "duckdb")
270        # Set connexion format
271        self.connexion_format = connexion_format
272
273        # Connexion
274        if not conn:
275            if connexion_format in ["duckdb"]:
276                conn = duckdb.connect(connexion_db, config=connexion_config)
277                # duckDB settings
278                duckdb_settings = self.get_duckdb_settings()
279                if duckdb_settings:
280                    for setting in duckdb_settings:
281                        setting_value = duckdb_settings.get(setting)
282                        if isinstance(setting_value, str):
283                            setting_value = f"'{setting_value}'"
284                        conn.execute(f"PRAGMA {setting}={setting_value};")
285            elif connexion_format in ["sqlite"]:
286                conn = sqlite3.connect(connexion_db)
287
288        # Set connexion
289        self.conn = conn
290
291        # Log
292        log.debug(f"connexion_format: {connexion_format}")
293        log.debug(f"connexion_db: {connexion_db}")
294        log.debug(f"connexion config: {connexion_config}")
295        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

It creates a connection to the database

Parameters
  • conn: The connection to the database. If not provided, a new connection to an in-memory database is created
def set_output(self, output: str = None) -> None:
297    def set_output(self, output: str = None) -> None:
298        """
299        If the config file has an output key, set the output to the value of that key. Otherwise, set
300        the output to the input
301
302        :param output: The name of the output file
303        """
304
305        if output and not isinstance(output, str):
306            self.output = output.name
307        else:
308            self.output = output
309
310        # Output format
311        if self.output:
312            output_name, output_extension = os.path.splitext(self.output)
313            self.output_name = output_name
314            self.output_extension = output_extension
315            self.output_format = self.output_extension.replace(".", "")
316        else:
317            self.output_name = None
318            self.output_extension = None
319            self.output_format = None

Sets the output file (using the object's name attribute if a file-like object is given rather than a string), and derives the output name, extension and format from it.

Parameters
  • output: The name of the output file
def set_header(self) -> None:
321    def set_header(self) -> None:
322        """
323        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
324        """
325
326        input_file = self.get_input()
327        default_header_list = [
328            "##fileformat=VCFv4.2",
329            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
330        ]
331
332        # Full path
333        input_file = full_path(input_file)
334
335        if input_file:
336
337            input_format = self.get_input_format()
338            input_compressed = self.get_input_compressed()
339            config = self.get_config()
340            header_list = default_header_list
341            if input_format in [
342                "vcf",
343                "hdr",
344                "tsv",
345                "csv",
346                "psv",
347                "parquet",
348                "db",
349                "duckdb",
350            ]:
351                # header provided in param
352                if config.get("header_file", None):
353                    with open(config.get("header_file"), "rt") as f:
354                        header_list = self.read_vcf_header(f)
355                # within a vcf file format (header within input file itsself)
356                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
357                    # within a compressed vcf file format (.vcf.gz)
358                    if input_compressed:
359                        with bgzf.open(input_file, "rt") as f:
360                            header_list = self.read_vcf_header(f)
361                    # within an uncompressed vcf file format (.vcf)
362                    else:
363                        with open(input_file, "rt") as f:
364                            header_list = self.read_vcf_header(f)
365                # header provided in default external file .hdr
366                elif os.path.exists((input_file + ".hdr")):
367                    with open(input_file + ".hdr", "rt") as f:
368                        header_list = self.read_vcf_header(f)
369                else:
370                    try:  # Try to get header info fields and file columns
371
372                        with tempfile.TemporaryDirectory() as tmpdir:
373
374                            # Create database
375                            db_for_header = Database(database=input_file)
376
377                            # Get header columns for infos fields
378                            db_header_from_columns = (
379                                db_for_header.get_header_from_columns()
380                            )
381
382                            # Get real columns in the file
383                            db_header_columns = db_for_header.get_columns()
384
385                            # Write header file
386                            header_file_tmp = os.path.join(tmpdir, "header")
387                            f = open(header_file_tmp, "w")
388                            vcf.Writer(f, db_header_from_columns)
389                            f.close()
390
391                            # Replace #CHROM line with rel columns
392                            header_list = db_for_header.read_header_file(
393                                header_file=header_file_tmp
394                            )
395                            header_list[-1] = "\t".join(db_header_columns)
396
397                    except:
398
399                        log.warning(
400                            f"No header for file {input_file}. Set as default VCF header"
401                        )
402                        header_list = default_header_list
403
404            else:  # try for unknown format ?
405
406                log.error(f"Input file format '{input_format}' not available")
407                raise ValueError(f"Input file format '{input_format}' not available")
408
409            if not header_list:
410                header_list = default_header_list
411
412            # header as list
413            self.header_list = header_list
414
415            # header as VCF object
416            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
417
418        else:
419
420            self.header_list = None
421            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
423    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
424        """
425        > The function `get_query_to_df` takes a query as a string and returns a pandas dataframe
426
427        :param query: str = ""
428        :type query: str
429        :return: A dataframe
430        """
431
432        # Connexion format
433        connexion_format = self.get_connexion_format()
434
435        # Limit in query
436        if limit:
437            pd.set_option("display.max_rows", limit)
438            if connexion_format in ["duckdb"]:
439                df = (
440                    self.conn.execute(query)
441                    .fetch_record_batch(limit)
442                    .read_next_batch()
443                    .to_pandas()
444                )
445            elif connexion_format in ["sqlite"]:
446                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
447
448        # Full query
449        else:
450            if connexion_format in ["duckdb"]:
451                df = self.conn.execute(query).df()
452            elif connexion_format in ["sqlite"]:
453                df = pd.read_sql_query(query, self.conn)
454
455        return df

The function get_query_to_df takes a query as a string and returns a pandas dataframe

Parameters
  • query: str = ""
Returns

A dataframe

def get_overview(self) -> None:
457    def get_overview(self) -> None:
458        """
459        The function prints the input, output, config, and dataframe of the current object
460        """
461        table_variants_from = self.get_table_variants(clause="from")
462        sql_columns = self.get_header_columns_as_sql()
463        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
464        df = self.get_query_to_df(sql_query_export)
465        log.info(
466            "Input:  "
467            + str(self.get_input())
468            + " ["
469            + str(str(self.get_input_format()))
470            + "]"
471        )
472        log.info(
473            "Output: "
474            + str(self.get_output())
475            + " ["
476            + str(str(self.get_output_format()))
477            + "]"
478        )
479        log.info("Config: ")
480        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
481            "\n"
482        ):
483            log.info("\t" + str(d))
484        log.info("Param: ")
485        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
486            "\n"
487        ):
488            log.info("\t" + str(d))
489        log.info("Sample list: " + str(self.get_header_sample_list()))
490        log.info("Dataframe: ")
491        for d in str(df).split("\n"):
492            log.info("\t" + str(d))
493
494        # garbage collector
495        del df
496        gc.collect()
497
498        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
500    def get_stats(self) -> dict:
501        """
502        The `get_stats` function calculates and returns various statistics of the current object,
503        including information about the input file, variants, samples, header fields, quality, and
504        SNVs/InDels.
505        :return: a dictionary containing various statistics of the current object. The dictionary has
506        the following structure:
507        """
508
509        # Log
510        log.info(f"Stats Calculation...")
511
512        # table varaints
513        table_variants_from = self.get_table_variants()
514
515        # stats dict
516        stats = {"Infos": {}}
517
518        ### File
519        input_file = self.get_input()
520        stats["Infos"]["Input file"] = input_file
521
522        # Header
523        header_infos = self.get_header().infos
524        header_formats = self.get_header().formats
525        header_infos_list = list(header_infos)
526        header_formats_list = list(header_formats)
527
528        ### Variants
529
530        stats["Variants"] = {}
531
532        # Variants by chr
533        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
534        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
535        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
536            by=["CHROM"], kind="quicksort"
537        )
538
539        # Total number of variants
540        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
541
542        # Calculate percentage
543        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
544            lambda x: (x / nb_of_variants)
545        )
546
547        stats["Variants"]["Number of variants by chromosome"] = (
548            nb_of_variants_by_chrom.to_dict(orient="index")
549        )
550
551        stats["Infos"]["Number of variants"] = int(nb_of_variants)
552
553        ### Samples
554
555        # Init
556        samples = {}
557        nb_of_samples = 0
558
559        # Check Samples
560        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
561            log.debug(f"Check samples...")
562            for sample in self.get_header_sample_list():
563                sql_query_samples = f"""
564                    SELECT  '{sample}' as sample,
565                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
566                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
567                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
568                    FROM {table_variants_from}
569                    WHERE (
570                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
571                        AND
572                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
573                      )
574                    GROUP BY genotype
575                    """
576                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
577                sample_genotype_count = sql_query_genotype_df["count"].sum()
578                if len(sql_query_genotype_df):
579                    nb_of_samples += 1
580                    samples[f"{sample} - {sample_genotype_count} variants"] = (
581                        sql_query_genotype_df.to_dict(orient="index")
582                    )
583
584            stats["Samples"] = samples
585            stats["Infos"]["Number of samples"] = nb_of_samples
586
587        # #
588        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
589        #     stats["Infos"]["Number of samples"] = nb_of_samples
590        # elif nb_of_samples:
591        #     stats["Infos"]["Number of samples"] = "not a VCF format"
592
593        ### INFO and FORMAT fields
594        header_types_df = {}
595        header_types_list = {
596            "List of INFO fields": header_infos,
597            "List of FORMAT fields": header_formats,
598        }
599        i = 0
600        for header_type in header_types_list:
601
602            header_type_infos = header_types_list.get(header_type)
603            header_infos_dict = {}
604
605            for info in header_type_infos:
606
607                i += 1
608                header_infos_dict[i] = {}
609
610                # ID
611                header_infos_dict[i]["id"] = info
612
613                # num
614                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
615                if header_type_infos[info].num in genotype_map.keys():
616                    header_infos_dict[i]["Number"] = genotype_map.get(
617                        header_type_infos[info].num
618                    )
619                else:
620                    header_infos_dict[i]["Number"] = header_type_infos[info].num
621
622                # type
623                if header_type_infos[info].type:
624                    header_infos_dict[i]["Type"] = header_type_infos[info].type
625                else:
626                    header_infos_dict[i]["Type"] = "."
627
628                # desc
629                if header_type_infos[info].desc != None:
630                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
631                else:
632                    header_infos_dict[i]["Description"] = ""
633
634            if len(header_infos_dict):
635                header_types_df[header_type] = pd.DataFrame.from_dict(
636                    header_infos_dict, orient="index"
637                ).to_dict(orient="index")
638
639        # Stats
640        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
641        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
642        stats["Header"] = header_types_df
643
644        ### QUAL
645        if "QUAL" in self.get_header_columns():
646            sql_query_qual = f"""
647                    SELECT
648                        avg(CAST(QUAL AS INTEGER)) AS Average,
649                        min(CAST(QUAL AS INTEGER)) AS Minimum,
650                        max(CAST(QUAL AS INTEGER)) AS Maximum,
651                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
652                        median(CAST(QUAL AS INTEGER)) AS Median,
653                        variance(CAST(QUAL AS INTEGER)) AS Variance
654                    FROM {table_variants_from}
655                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
656                    """
657
658            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
659            stats["Quality"] = {"Stats": qual}
660
661        ### SNV and InDel
662
663        sql_query_snv = f"""
664            
665            SELECT Type, count FROM (
666
667                    SELECT
668                        'Total' AS Type,
669                        count(*) AS count
670                    FROM {table_variants_from}
671
672                    UNION
673
674                    SELECT
675                        'MNV' AS Type,
676                        count(*) AS count
677                    FROM {table_variants_from}
678                    WHERE len(REF) > 1 AND len(ALT) > 1
679                    AND len(REF) = len(ALT)
680
681                    UNION
682
683                    SELECT
684                        'InDel' AS Type,
685                        count(*) AS count
686                    FROM {table_variants_from}
687                    WHERE len(REF) > 1 OR len(ALT) > 1
688                    AND len(REF) != len(ALT)
689                    
690                    UNION
691
692                    SELECT
693                        'SNV' AS Type,
694                        count(*) AS count
695                    FROM {table_variants_from}
696                    WHERE len(REF) = 1 AND len(ALT) = 1
697
698                )
699
700            ORDER BY count DESC
701
702                """
703        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
704
705        sql_query_snv_substitution = f"""
706                SELECT
707                    concat(REF, '>', ALT) AS 'Substitution',
708                    count(*) AS count
709                FROM {table_variants_from}
710                WHERE len(REF) = 1 AND len(ALT) = 1
711                GROUP BY REF, ALT
712                ORDER BY count(*) DESC
713                """
714        snv_substitution = (
715            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
716        )
717        stats["Variants"]["Counts"] = snv_indel
718        stats["Variants"]["Substitutions"] = snv_substitution
719
720        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
722    def stats_to_file(self, file: str = None) -> str:
723        """
724        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
725        into a JSON object, and writes the JSON object to the specified file.
726
727        :param file: The `file` parameter is a string that represents the file path where the JSON data
728        will be written
729        :type file: str
730        :return: the name of the file that was written to.
731        """
732
733        # Get stats
734        stats = self.get_stats()
735
736        # Serializing json
737        json_object = json.dumps(stats, indent=4)
738
739        # Writing to sample.json
740        with open(file, "w") as outfile:
741            outfile.write(json_object)
742
743        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
745    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
746        """
747        The `print_stats` function generates a markdown file and prints the statistics contained in a
748        JSON file in a formatted manner.
749
750        :param output_file: The `output_file` parameter is a string that specifies the path and filename
751        of the output file where the stats will be printed in Markdown format. If no `output_file` is
752        provided, a temporary directory will be created and the stats will be saved in a file named
753        "stats.md" within that
754        :type output_file: str
755        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
756        file where the statistics will be saved. If no value is provided, a temporary directory will be
757        created and a default file name "stats.json" will be used
758        :type json_file: str
759        :return: The function `print_stats` does not return any value. It has a return type annotation
760        of `None`.
761        """
762
763        # Full path
764        output_file = full_path(output_file)
765        json_file = full_path(json_file)
766
767        with tempfile.TemporaryDirectory() as tmpdir:
768
769            # Files
770            if not output_file:
771                output_file = os.path.join(tmpdir, "stats.md")
772            if not json_file:
773                json_file = os.path.join(tmpdir, "stats.json")
774
775            # Create folders
776            if not os.path.exists(os.path.dirname(output_file)):
777                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
778            if not os.path.exists(os.path.dirname(json_file)):
779                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
780
781            # Create stats JSON file
782            stats_file = self.stats_to_file(file=json_file)
783
784            # Print stats file
785            with open(stats_file) as f:
786                stats = yaml.safe_load(f)
787
788            # Output
789            output_title = []
790            output_index = []
791            output = []
792
793            # Title
794            output_title.append("# HOWARD Stats")
795
796            # Index
797            output_index.append("## Index")
798
799            # Process sections
800            for section in stats:
801                infos = stats.get(section)
802                section_link = "#" + section.lower().replace(" ", "-")
803                output.append(f"## {section}")
804                output_index.append(f"- [{section}]({section_link})")
805
806                if len(infos):
807                    for info in infos:
808                        try:
809                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
810                            is_df = True
811                        except:
812                            try:
813                                df = pd.DataFrame.from_dict(
814                                    json.loads((infos.get(info))), orient="index"
815                                )
816                                is_df = True
817                            except:
818                                is_df = False
819                        if is_df:
820                            output.append(f"### {info}")
821                            info_link = "#" + info.lower().replace(" ", "-")
822                            output_index.append(f"   - [{info}]({info_link})")
823                            output.append(f"{df.to_markdown(index=False)}")
824                        else:
825                            output.append(f"- {info}: {infos.get(info)}")
826                else:
827                    output.append(f"NA")
828
829            # Write stats in markdown file
830            with open(output_file, "w") as fp:
831                for item in output_title:
832                    fp.write("%s\n" % item)
833                for item in output_index:
834                    fp.write("%s\n" % item)
835                for item in output:
836                    fp.write("%s\n" % item)
837
838            # Output stats in markdown
839            print("")
840            print("\n\n".join(output_title))
841            print("")
842            print("\n\n".join(output))
843            print("")
844
845        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
847    def get_input(self) -> str:
848        """
849        It returns the value of the input variable.
850        :return: The input is being returned.
851        """
852        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
854    def get_input_format(self, input_file: str = None) -> str:
855        """
856        It returns the format of the input variable.
857        :return: The format is being returned.
858        """
859        if not input_file:
860            input_file = self.get_input()
861        input_format = get_file_format(input_file)
862        return input_format

It returns the format of the input variable.

Returns

The format is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
864    def get_input_compressed(self, input_file: str = None) -> str:
865        """
866        It returns the format of the input variable.
867        :return: The format is being returned.
868        """
869        if not input_file:
870            input_file = self.get_input()
871        input_compressed = get_file_compressed(input_file)
872        return input_compressed

It returns the format of the input variable.

Returns

The format is being returned.

def get_output(self) -> str:
874    def get_output(self) -> str:
875        """
876        It returns the output of the neuron.
877        :return: The output of the neural network.
878        """
879        return self.output

It returns the output file path.

Returns

The output file path.

def get_output_format(self, output_file: str = None) -> str:
881    def get_output_format(self, output_file: str = None) -> str:
882        """
883        It returns the format of the input variable.
884        :return: The format is being returned.
885        """
886        if not output_file:
887            output_file = self.get_output()
888        output_format = get_file_format(output_file)
889
890        return output_format

It returns the format of the input variable.

Returns

The format is being returned.

def get_config(self) -> dict:
892    def get_config(self) -> dict:
893        """
894        It returns the config
895        :return: The config variable is being returned.
896        """
897        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
899    def get_param(self) -> dict:
900        """
901        It returns the param
902        :return: The param variable is being returned.
903        """
904        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
906    def get_connexion_db(self) -> str:
907        """
908        It returns the connexion_db attribute of the object
909        :return: The connexion_db is being returned.
910        """
911        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
913    def get_prefix(self) -> str:
914        """
915        It returns the prefix of the object.
916        :return: The prefix is being returned.
917        """
918        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
920    def get_table_variants(self, clause: str = "select") -> str:
921        """
922        This function returns the table_variants attribute of the object
923
924        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
925        defaults to select (optional)
926        :return: The table_variants attribute of the object.
927        """
928
929        # Access
930        access = self.get_config().get("access", None)
931
932        # Clauses "select", "where", "update"
933        if clause in ["select", "where", "update"]:
934            table_variants = self.table_variants
935        # Clause "from"
936        elif clause in ["from"]:
937            # For Read Only
938            if self.get_input_format() in ["parquet"] and access in ["RO"]:
939                input_file = self.get_input()
940                table_variants = f"'{input_file}' as variants"
941            # For Read Write
942            else:
943                table_variants = f"{self.table_variants} as variants"
944        else:
945            table_variants = self.table_variants
946        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
948    def get_tmp_dir(self) -> str:
949        """
950        The function `get_tmp_dir` returns the temporary directory path based on configuration
951        parameters or a default path.
952        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
953        configuration, parameters, and a default value of "/tmp".
954        """
955
956        return get_tmp(
957            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
958        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
960    def get_connexion_type(self) -> str:
961        """
962        If the connexion type is not in the list of allowed connexion types, raise a ValueError
963
964        :return: The connexion type is being returned.
965        """
966        return self.get_config().get("connexion_type", "memory")

If the connexion type is not in the list of allowed connexion types, raise a ValueError

Returns

The connexion type is being returned.

def get_connexion(self):
968    def get_connexion(self):
969        """
970        It returns the connection object
971
972        :return: The connection object.
973        """
974        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
976    def close_connexion(self) -> None:
977        """
978        This function closes the connection to the database.
979        :return: The connection is being closed.
980        """
981        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
 983    def get_header(self, type: str = "vcf"):
 984        """
 985        This function returns the header of the VCF file as a list of strings
 986
 987        :param type: the type of header you want to get, defaults to vcf (optional)
 988        :return: The header of the vcf file.
 989        """
 990
 991        if self.header_vcf:
 992            if type == "vcf":
 993                return self.header_vcf
 994            elif type == "list":
 995                return self.header_list
 996        else:
 997            if type == "vcf":
 998                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 999                return header
1000            elif type == "list":
1001                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1003    def get_header_length(self, file: str = None) -> int:
1004        """
1005        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1006        line.
1007
1008        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1009        header file. If this argument is provided, the function will read the header from the specified
1010        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1011        :type file: str
1012        :return: the length of the header list, excluding the #CHROM line.
1013        """
1014
1015        if file:
1016            return len(self.read_vcf_header_file(file=file)) - 1
1017        elif self.get_header(type="list"):
1018            return len(self.get_header(type="list")) - 1
1019        else:
1020            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1022    def get_header_columns(self) -> str:
1023        """
1024        This function returns the header list of a VCF
1025
1026        :return: The length of the header list.
1027        """
1028        if self.get_header():
1029            return self.get_header(type="list")[-1]
1030        else:
1031            return ""

This function returns the header list of a VCF

Returns

The length of the header list.

def get_header_columns_as_list(self) -> list:
1033    def get_header_columns_as_list(self) -> list:
1034        """
1035        This function returns the header list of a VCF
1036
1037        :return: The length of the header list.
1038        """
1039        if self.get_header():
1040            return self.get_header_columns().strip().split("\t")
1041        else:
1042            return []

This function returns the header list of a VCF

Returns

The length of the header list.

def get_header_columns_as_sql(self) -> str:
1044    def get_header_columns_as_sql(self) -> str:
1045        """
1046        This function retruns header length (without #CHROM line)
1047
1048        :return: The length of the header list.
1049        """
1050        sql_column_list = []
1051        for col in self.get_header_columns_as_list():
1052            sql_column_list.append(f'"{col}"')
1053        return ",".join(sql_column_list)

This function returns the header column names, double-quoted and comma-joined for use in SQL.

Returns

A comma-separated string of quoted column names.

def get_header_sample_list(self) -> list:
1055    def get_header_sample_list(self) -> list:
1056        """
1057        This function retruns header length (without #CHROM line)
1058
1059        :return: The length of the header list.
1060        """
1061        return self.header_vcf.samples

This function returns the list of sample names declared in the VCF header.

Returns

The list of sample names.

def get_verbose(self) -> bool:
1063    def get_verbose(self) -> bool:
1064        """
1065        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1066        exist
1067
1068        :return: The value of the key "verbose" in the config dictionary.
1069        """
1070        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1072    def get_connexion_format(self) -> str:
1073        """
1074        It returns the connexion format of the object.
1075        :return: The connexion_format is being returned.
1076        """
1077        connexion_format = self.connexion_format
1078        if connexion_format not in ["duckdb", "sqlite"]:
1079            log.error(f"Unknown connexion format {connexion_format}")
1080            raise ValueError(f"Unknown connexion format {connexion_format}")
1081        else:
1082            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connection.

        :param file: the file (path or file-like object) to be loaded
        :param columns: comma-separated, quoted column names to insert into
        :param header_len: number of lines to skip at the beginning of the
        file, defaults to 0 (optional)
        :param sep: the field separator used in the file, defaults to tab
        (optional)
        :param chunksize: number of rows read per chunk; if falsy, nothing is
        inserted, defaults to 1000000 (optional)
        """

        # Config: the configured load chunk size overrides the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize is falsy the file is silently not loaded
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "chunk" to the local DataFrame
                    # (replacement scan) — presumably; verify against the
                    # duckdb version in use
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks, and inserts each chunk into a table

Parameters
  • file: the file to be loaded
  • columns: a string of the column names separated by commas
  • header_len: the number of lines to skip at the beginning of the file, defaults to 0 (optional)
  • sep: the separator used in the file, defaults to tab (optional)
  • chunksize: The number of rows to read in at a time, defaults to 1000000 (optional)
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file (VCF/TSV/CSV/PSV or an existing database format)
        and load it into the variants table, optionally dropping the table
        first. After loading, INFO fields may be exploded into columns and
        indexes are (re)created.

        :param input_file: The path to the input file. This is the VCF file
        that will be loaded into the table; when given, it replaces the
        object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: when True, the variants table is dropped
        before loading, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: the number of rows to be sampled from the input
        file when probing its structure; a falsy value means no limit (-1),
        defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: if the input format is not compatible with the
        connexion format, or cannot be loaded
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format (only used for logging here)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means "no limit" for the Database probe
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): unreachable — the outer branch already
                    # guarantees connexion_format is "duckdb"
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access exposes the source as a VIEW instead of
                    # materializing it into a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    # NOTE(review): bare except hides the original error; any
                    # failure is reported as an unavailable format
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Strcuture with samples
            # NOTE(review): this is an alias, not a copy — mutating either
            # name below changes the same dict
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): the bgzf handle replaces the plain handle and
                # is never explicitly closed; the plain handle stays open until
                # the with-block exits
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1317    def get_explode_infos(self) -> bool:
1318        """
1319        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1320        to False if it is not set.
1321        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1322        value. If the parameter is not present, it will return False.
1323        """
1324
1325        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1327    def get_explode_infos_fields(
1328        self,
1329        explode_infos_fields: str = None,
1330        remove_fields_not_in_header: bool = False,
1331    ) -> list:
1332        """
1333        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1334        the input parameter `explode_infos_fields`.
1335
1336        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1337        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1338        comma-separated list of field names to explode
1339        :type explode_infos_fields: str
1340        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1341        flag that determines whether to remove fields that are not present in the header. If it is set
1342        to `True`, any field that is not in the header will be excluded from the list of exploded
1343        information fields. If it is set to `, defaults to False
1344        :type remove_fields_not_in_header: bool (optional)
1345        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1346        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1347        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1348        Otherwise, it returns a list of exploded information fields after removing any spaces and
1349        splitting the string by commas.
1350        """
1351
1352        # If no fields, get it in param
1353        if not explode_infos_fields:
1354            explode_infos_fields = (
1355                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1356            )
1357
1358        # If no fields, defined as all fields in header using keyword
1359        if not explode_infos_fields:
1360            explode_infos_fields = "*"
1361
1362        # If fields list not empty
1363        if explode_infos_fields:
1364
1365            # Input fields list
1366            if isinstance(explode_infos_fields, str):
1367                fields_input = explode_infos_fields.split(",")
1368            elif isinstance(explode_infos_fields, list):
1369                fields_input = explode_infos_fields
1370            else:
1371                fields_input = []
1372
1373            # Fields list without * keyword
1374            fields_without_all = fields_input.copy()
1375            if "*".casefold() in (item.casefold() for item in fields_without_all):
1376                fields_without_all.remove("*")
1377
1378            # Fields in header
1379            fields_in_header = sorted(list(set(self.get_header().infos)))
1380
1381            # Construct list of fields
1382            fields_output = []
1383            for field in fields_input:
1384
1385                # Strip field
1386                field = field.strip()
1387
1388                # format keyword * in regex
1389                if field.upper() in ["*"]:
1390                    field = ".*"
1391
1392                # Find all fields with pattern
1393                r = re.compile(field)
1394                fields_search = sorted(list(filter(r.match, fields_in_header)))
1395
1396                # Remove fields input from search
1397                if fields_search != [field]:
1398                    fields_search = sorted(
1399                        list(set(fields_search).difference(fields_input))
1400                    )
1401
1402                # If field is not in header (avoid not well formatted header)
1403                if not fields_search and not remove_fields_not_in_header:
1404                    fields_search = [field]
1405
1406                # Add found fields
1407                for new_field in fields_search:
1408                    # Add field, if not already exists, and if it is in header (if asked)
1409                    if (
1410                        new_field not in fields_output
1411                        and (
1412                            not remove_fields_not_in_header
1413                            or new_field in fields_in_header
1414                        )
1415                        and new_field not in [".*"]
1416                    ):
1417                        fields_output.append(new_field)
1418
1419            return fields_output
1420
1421        else:
1422
1423            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided or is set to None, it returns an empty list. If the parameter is provided and its value is "ALL", it also returns an empty list. Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1425    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1426        """
1427        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1428        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1429        not provided.
1430
1431        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1432        prefix to be used for exploding or expanding information
1433        :type explode_infos_prefix: str
1434        :return: the value of the variable `explode_infos_prefix`.
1435        """
1436
1437        if not explode_infos_prefix:
1438            explode_infos_prefix = (
1439                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1440            )
1441
1442        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
def add_column(
    self,
    table_name,
    column_name,
    column_type,
    default_value=None,
    drop: bool = False,
) -> dict:
    """
    The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
    doesn't already exist.

    :param table_name: The name of the table to which you want to add a column
    :param column_name: The name of the column that you want to add to the table
    :param column_type: The data type of the column to add, as a SQL type string such as
    "INTEGER", "TEXT", "REAL", etc
    :param default_value: Optional default value for the newly added column. If provided, it is
    assigned to the column for any existing rows that do not have a value for that column
    :param drop: If `drop` is `True` and the column already exists, the existing column is
    dropped and re-created; if `drop` is `False` (default), an existing column is left
    untouched and the function returns `None`. Defaults to False
    :type drop: bool (optional)
    :return: a dictionary describing the added column (keys "table_name", "column_name",
    "column_type", "default_value") when a brand new column was created, or `None` when the
    column already existed (whether or not it was dropped and re-created).
    """

    # added
    added = False
    dropped = False

    # Check if the column already exists in the table
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(
            f"The {column_name} column already exists in the {table_name} table"
        )
        if drop:
            self.drop_column(table_name=table_name, column_name=column_name)
            dropped = True
        else:
            return None
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")

    # Add column in table
    add_column_query = (
        f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
    )
    if default_value is not None:
        add_column_query += f" DEFAULT {default_value}"
    self.execute_query(add_column_query)
    # A column that was dropped and re-created is deliberately NOT reported as "added"
    added = not dropped
    log.debug(
        f"The {column_name} column was successfully added to the {table_name} table"
    )

    if added:
        added_column = {
            "table_name": table_name,
            "column_name": column_name,
            "column_type": column_type,
            "default_value": default_value,
        }
    else:
        added_column = None

    return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; if drop is set to False (default), an existing column is left untouched. Defaults to False.
Returns

a dictionary describing the added column (table_name, column_name, column_type, default_value) when a brand new column was created, or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
def drop_column(
    self, column: dict = None, table_name: str = None, column_name: str = None
) -> bool:
    """
    The `drop_column` function drops a specified column from a given table in a database and returns
    True if the column was successfully dropped, and False if the column does not exist in the
    table (or if the table/column could not be determined).

    :param column: Either a dictionary with the keys "table_name" and "column_name", or a
    column name string (the table then defaults to the variants table). Takes precedence
    over the `table_name`/`column_name` parameters
    :type column: dict
    :param table_name: The name of the table from which you want to drop a column
    :type table_name: str
    :param column_name: The name of the column that you want to drop from the table
    :type column_name: str
    :return: a boolean value. True if the column was successfully dropped from the table,
    False if the column does not exist in the table.
    """

    # Find column infos
    if column:
        if isinstance(column, dict):
            table_name = column.get("table_name", None)
            column_name = column.get("column_name", None)
        elif isinstance(column, str):
            table_name = self.get_table_variants()
            column_name = column
        else:
            table_name = None
            column_name = None

    # Both a table AND a column are required to drop anything
    # (the previous `and` let a half-specified request through and built an invalid query)
    if not table_name or not column_name:
        return False

    # Removed
    removed = False

    # Check if the column already exists in the table
    query = f""" SELECT * FROM {table_name} LIMIT 0 """
    columns = self.get_query_to_df(query).columns.tolist()
    if column_name in columns:
        log.debug(f"The {column_name} column exists in the {table_name} table")
    else:
        log.debug(f"The {column_name} column NOT exists in the {table_name} table")
        return False

    # Add column in table # ALTER TABLE integers DROP k
    add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
    self.execute_query(add_column_query)
    removed = True
    log.debug(
        f"The {column_name} column was successfully dropped to the {table_name} table"
    )

    return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys:
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False) -> list:
def explode_infos(
    self,
    prefix: str = None,
    create_index: bool = False,
    fields: list = None,
    force: bool = False,
    proccess_all_fields_together: bool = False,
) -> list:
    """
    The `explode_infos` function takes a VCF file and explodes the INFO fields into individual
    columns, returning a list of added columns.

    :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
    fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
    `self.get_explode_infos_prefix()` as the prefix (falling back to "INFO/")
    :type prefix: str
    :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
    create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
    `False`, indexes will not be created. The default value is `False`, defaults to False
    :type create_index: bool (optional)
    :param fields: The `fields` parameter is a list of INFO fields that you want to explode into
    individual columns. If this parameter is not provided, all INFO fields will be exploded
    :type fields: list
    :param force: The `force` parameter is a boolean flag that determines whether to drop and
    recreate the column if it already exists in the table. If `force` is set to `True`, the column
    will be dropped and recreated. If `force` is set to `False`, the column will not be dropped,
    defaults to False
    :type force: bool (optional)
    :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
    flag that determines whether to process all the INFO fields together or individually. If set to
    `True`, all the INFO fields will be processed together (one UPDATE with all SET clauses). If set
    to `False`, each INFO field will be processed individually, defaults to False
    :type proccess_all_fields_together: bool (optional)
    :return: The function `explode_infos` returns a list of added columns.
    """

    # drop indexes (the UPDATEs below would otherwise churn the indexes)
    self.drop_indexes()

    # connexion format
    connexion_format = self.get_connexion_format()

    # Access (read-only connexions must not modify the table)
    access = self.get_config().get("access", None)

    # Added columns
    added_columns = []

    if access not in ["RO"]:

        # prefix
        if prefix in [None, True] or not isinstance(prefix, str):
            if self.get_explode_infos_prefix() not in [None, True]:
                prefix = self.get_explode_infos_prefix()
            else:
                prefix = "INFO/"

        # table variants
        table_variants = self.get_table_variants(clause="select")

        # extra infos
        try:
            extra_infos = self.get_extra_infos()
        except:
            # NOTE(review): broad except — assumes any failure means "no extra infos"
            extra_infos = []

        # Header infos
        header_infos = self.get_header().infos

        log.debug(
            f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
        )

        sql_info_alter_table_array = []

        # Info fields to check
        fields_list = list(header_infos)
        if fields:
            fields_list += fields
        fields_list = set(fields_list)

        # If no fields
        if not fields:
            fields = []

        # Translate fields if patterns
        fields = self.get_explode_infos_fields(explode_infos_fields=fields)

        for info in fields:

            info_id_sql = prefix + info

            if (
                info in fields_list
                or prefix + info in fields_list
                or info in extra_infos
            ):

                log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                # Column type/cardinality from the VCF header when known,
                # otherwise a single-value String
                if info in header_infos:
                    info_type = header_infos[info].type
                    info_num = header_infos[info].num
                else:
                    info_type = "String"
                    info_num = 0

                type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                # Multi-valued fields (Number != 1) are stored as VARCHAR
                if info_num != 1:
                    type_sql = "VARCHAR"

                # Add field
                added_column = self.add_column(
                    table_name=table_variants,
                    column_name=info_id_sql,
                    column_type=type_sql,
                    default_value="null",
                    drop=force,
                )

                if added_column:
                    added_columns.append(added_column)

                if added_column or force:

                    # add field to index
                    self.index_additionnal_fields.append(info_id_sql)

                    # Update field array
                    # NOTE(review): only "duckdb" and "sqlite" connexion formats are handled;
                    # any other format would append an unset/stale update_info_field — confirm
                    # the set of supported formats
                    if connexion_format in ["duckdb"]:
                        update_info_field = f"""
                        "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                        """
                    elif connexion_format in ["sqlite"]:
                        # sqlite has no REGEXP_EXTRACT: parse "<info>=<value>" out of INFO
                        # with instr/substr instead
                        update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                        """

                    sql_info_alter_table_array.append(update_info_field)

        if sql_info_alter_table_array:

            # By chromosomes (batch the UPDATEs per chromosome to bound their size)
            try:
                chromosomes_list = list(
                    self.get_query_to_df(
                        f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                    )["#CHROM"]
                )
            except:
                # NOTE(review): broad except — falls back to a single unfiltered pass
                chromosomes_list = [None]

            for chrom in chromosomes_list:
                log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                # Where clause (skip the filter when there is only one chromosome)
                where_clause = ""
                if chrom and len(chromosomes_list) > 1:
                    where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                # Update table
                if proccess_all_fields_together:
                    sql_info_alter_table_array_join = ", ".join(
                        sql_info_alter_table_array
                    )
                    if sql_info_alter_table_array_join:
                        sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                        log.debug(
                            f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)
                else:
                    sql_info_alter_num = 0
                    for sql_info_alter in sql_info_alter_table_array:
                        sql_info_alter_num += 1
                        sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                        log.debug(
                            f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                        )
                        # log.debug(sql_info_alter_table)
                        self.conn.execute(sql_info_alter_table)

    # create indexes
    if create_index:
        self.create_indexes()

    return added_columns

The explode_infos function takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded
  • force: The force parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. If force is set to True, the column will be dropped and recreated. If force is set to False, the column will not be dropped, defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually, defaults to False
Returns

The function explode_infos returns a list of added columns.

def create_indexes(self) -> None:
def create_indexes(self) -> None:
    """
    Create indexes on the variants table after insertion.

    No-op when indexing is disabled or when the connexion is read-only ("RO").
    """

    # Access mode: read-only connexions must not create indexes
    access = self.get_config().get("access", None)

    # Table holding the variants
    table_variants = self.get_table_variants("FROM")

    if not self.get_indexing() or access in ["RO"]:
        return

    # (index-name suffix, indexed column list) pairs for the standard indexes
    standard_indexes = [
        ("", '"#CHROM", "POS", "REF", "ALT"'),
        ("_chrom", '"#CHROM"'),
        ("_pos", '"POS"'),
        ("_ref", ' "REF"'),
        ("_alt", '"ALT"'),
    ]
    for suffix, columns in standard_indexes:
        self.conn.execute(
            f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}{suffix} ON {table_variants} ({columns})'
        )

    # One extra index per additional exploded INFO field
    for field in self.index_additionnal_fields:
        self.conn.execute(
            f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
        )

Create indexes on the table after insertion

def drop_indexes(self) -> None:
def drop_indexes(self) -> None:
    """
    Drop all existing indexes on the variants table (e.g. before a bulk update/insertion).

    No-op when the connexion is read-only ("RO") or when the connexion format is not
    supported (neither "duckdb" nor "sqlite").
    """

    # Access
    access = self.get_config().get("access", None)

    # get table variants
    table_variants = self.get_table_variants("FROM")

    # Get database format
    connexion_format = self.get_connexion_format()

    if access not in ["RO"]:

        # Query listing the existing indexes, depending on the database engine
        if connexion_format in ["duckdb"]:
            sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
        elif connexion_format in ["sqlite"]:
            sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
        else:
            # Unsupported connexion format: nothing to drop
            # (previously fell through with sql_list_indexes unbound -> NameError)
            return

        list_indexes = self.conn.execute(sql_list_indexes)
        index_names = [row[0] for row in list_indexes.fetchall()]
        for index in index_names:
            sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
            self.conn.execute(sql_drop_table_index)

Drop all existing indexes on the variants table (e.g. before a bulk update or insertion).

def read_vcf_header(self, f) -> list:
def read_vcf_header(self, f) -> list:
    """
    Read the header of a VCF file and return the header lines, including the
    terminating "#CHROM" line.

    :param f: the file object (any iterable of lines)
    :return: The header lines of the VCF file.
    """

    collected = []
    for record in f:
        collected.append(record)
        # The "#CHROM" column line marks the end of the header
        if record.startswith("#CHROM"):
            return collected
    return collected

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
def read_vcf_header_file(self, file: str = None) -> list:
    """
    The function `read_vcf_header_file` reads the header of a VCF file, either from a compressed
    (BGZF) or uncompressed file. Whether the file is compressed is detected automatically with
    `self.get_input_compressed` — there is no separate "compressed" parameter.

    :param file: path to the VCF (header) file to read; defaults to `None`
    :type file: str
    :return: the list of header lines (see `read_vcf_header`).
    """

    # Open with BGZF when the file is compressed, as plain text otherwise
    if self.get_input_compressed(input_file=file):
        with bgzf.open(file, "rt") as f:
            return self.read_vcf_header(f=f)
    else:
        with open(file, "rt") as f:
            return self.read_vcf_header(f=f)

The function read_vcf_header_file reads the header of a VCF file, either from a compressed or uncompressed file.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
  • Note on compression: there is no compressed parameter — whether the VCF file is compressed (BGZF format) is detected automatically via get_input_compressed; compressed files are opened with bgzf, uncompressed files as plain text.
Returns

a list.

def execute_query(self, query: str):
def execute_query(self, query: str):
    """
    Execute the given query on the current connexion and return the raw result.

    :param query: The query to be executed
    :return: the result of `self.conn.execute(query)`, or `None` when no query is given.
    """

    # Empty / None query: nothing to execute
    if not query:
        return None
    return self.conn.execute(query)  # .fetchall()

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
1884    def export_output(
1885        self,
1886        output_file: str | None = None,
1887        output_header: str | None = None,
1888        export_header: bool = True,
1889        query: str | None = None,
1890        parquet_partitions: list | None = None,
1891        chunk_size: int | None = None,
1892        threads: int | None = None,
1893        sort: bool = False,
1894        index: bool = False,
1895        order_by: str | None = None,
1896    ) -> bool:
1897        """
1898        The `export_output` function exports data from a VCF file to a specified output file in various
1899        formats, including VCF, CSV, TSV, PSV, and Parquet.
1900
1901        :param output_file: The `output_file` parameter is a string that specifies the name of the
1902        output file to be generated by the function. This is where the exported data will be saved
1903        :type output_file: str
1904        :param output_header: The `output_header` parameter is a string that specifies the name of the
1905        file where the header of the VCF file will be exported. If this parameter is not provided, the
1906        header will be exported to a file with the same name as the `output_file` parameter, but with
1907        the extension "
1908        :type output_header: str
1909        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1910        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1911        True, the header will be exported to a file. If `export_header` is False, the header will not
1912        be, defaults to True, if output format is not VCF
1913        :type export_header: bool (optional)
1914        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1915        select specific data from the VCF file before exporting it. If provided, only the data that
1916        matches the query will be exported
1917        :type query: str
1918        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1919        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1920        organize data in a hierarchical directory structure based on the values of one or more columns.
1921        This can improve query performance when working with large datasets
1922        :type parquet_partitions: list
1923        :param chunk_size: The `chunk_size` parameter specifies the number of
1924        records in batch when exporting data in Parquet format. This parameter is used for
1925        partitioning the Parquet file into multiple files.
1926        :type chunk_size: int
1927        :param threads: The `threads` parameter is an optional parameter that specifies the number of
1928        threads to be used during the export process. It determines the level of parallelism and can
1929        improve the performance of the export operation. If not provided, the function will use the
1930        default number of threads
1931        :type threads: int
1932        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
1933        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
1934        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
1935        False
1936        :type sort: bool (optional)
1937        :param index: The `index` parameter is a boolean flag that determines whether an index should be
1938        created on the output file. If `index` is True, an index will be created. If `index` is False,
1939        no index will be created. The default value is False, defaults to False
1940        :type index: bool (optional)
1941        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
1942        sorting the output file. This parameter is only applicable when exporting data in VCF format
1943        :type order_by: str
1944        :return: a boolean value. It checks if the output file exists and returns True if it does, or
1945        None if it doesn't.
1946        """
1947
1948        # Log
1949        log.info("Exporting...")
1950
1951        # Full path
1952        output_file = full_path(output_file)
1953        output_header = full_path(output_header)
1954
1955        # Config
1956        config = self.get_config()
1957
1958        # Param
1959        param = self.get_param()
1960
1961        # Tmp files to remove
1962        tmp_to_remove = []
1963
1964        # If no output, get it
1965        if not output_file:
1966            output_file = self.get_output()
1967
1968        # If not threads
1969        if not threads:
1970            threads = self.get_threads()
1971
1972        # Auto header name with extension
1973        if export_header or output_header:
1974            if not output_header:
1975                output_header = f"{output_file}.hdr"
1976            # Export header
1977            self.export_header(output_file=output_file)
1978
1979        # Switch off export header if VCF output
1980        output_file_type = get_file_format(output_file)
1981        if output_file_type in ["vcf"]:
1982            export_header = False
1983            tmp_to_remove.append(output_header)
1984
1985        # Chunk size
1986        if not chunk_size:
1987            chunk_size = config.get("chunk_size", None)
1988
1989        # Parquet partition
1990        if not parquet_partitions:
1991            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
1992        if parquet_partitions and isinstance(parquet_partitions, str):
1993            parquet_partitions = parquet_partitions.split(",")
1994
1995        # Order by
1996        if not order_by:
1997            order_by = param.get("export", {}).get("order_by", "")
1998
1999        # Header in output
2000        header_in_output = param.get("export", {}).get("include_header", False)
2001
2002        # Database
2003        database_source = self.get_connexion()
2004
2005        # Connexion format
2006        connexion_format = self.get_connexion_format()
2007
2008        # Explode infos
2009        if self.get_explode_infos():
2010            self.explode_infos(
2011                prefix=self.get_explode_infos_prefix(),
2012                fields=self.get_explode_infos_fields(),
2013                force=False,
2014            )
2015
2016        # if connexion_format in ["sqlite"] or query:
2017        if connexion_format in ["sqlite"]:
2018
2019            # Export in Parquet
2020            random_tmp = "".join(
2021                random.choice(string.ascii_lowercase) for i in range(10)
2022            )
2023            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2024            tmp_to_remove.append(database_source)
2025
2026            # Table Variants
2027            table_variants = self.get_table_variants()
2028
2029            # Create export query
2030            sql_query_export_subquery = f"""
2031                SELECT * FROM {table_variants}
2032                """
2033
2034            # Write source file
2035            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2036
2037        # Create database
2038        database = Database(
2039            database=database_source,
2040            table="variants",
2041            header_file=output_header,
2042            conn_config=self.get_connexion_config(),
2043        )
2044
2045        # Existing colomns header
2046        # existing_columns_header = database.get_header_file_columns(output_header)
2047        existing_columns_header = database.get_header_columns_from_database()
2048
2049        # Export file
2050        database.export(
2051            output_database=output_file,
2052            output_header=output_header,
2053            existing_columns_header=existing_columns_header,
2054            parquet_partitions=parquet_partitions,
2055            chunk_size=chunk_size,
2056            threads=threads,
2057            sort=sort,
2058            index=index,
2059            header_in_output=header_in_output,
2060            order_by=order_by,
2061            query=query,
2062            export_header=export_header,
2063        )
2064
2065        # Remove
2066        remove_if_exists(tmp_to_remove)
2067
2068        return (os.path.exists(output_file) or None) and (
2069            os.path.exists(output_file) or None
2070        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr" appended
  • export_header: The export_header parameter is a boolean flag that determines whether the header of the VCF file should be exported to a separate file. If export_header is True, the header will be exported; if False, it will not. Defaults to True, and is automatically disabled when the output format is VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2072    def get_extra_infos(self, table: str = None) -> list:
2073        """
2074        > This function returns a list of columns that are in the table but not in the header
2075
2076        The function is called `get_extra_infos` and it takes two arguments: `self` and `table`. The
2077        `self` argument is a reference to the object that called the function. The `table` argument is
2078        the name of the table that we want to get the extra columns from
2079
2080        :param table: The table to get the extra columns from. If not specified, it will use the
2081        variants table
2082        :param format: The format of the output. If it's "sql", it will return a string of the extra
2083        columns separated by commas. If it's "list", it will return a list of the extra columns
2084        :return: A list of columns that are in the table but not in the header
2085        """
2086
2087        header_columns = []
2088
2089        if not table:
2090            table = self.get_table_variants(clause="from")
2091            header_columns = self.get_header_columns()
2092
2093        # Check all columns in the database
2094        query = f""" SELECT * FROM {table} LIMIT 1 """
2095        log.debug(f"query {query}")
2096        table_columns = self.get_query_to_df(query).columns.tolist()
2097        extra_columns = []
2098
2099        # Construct extra infos (not in header)
2100        for column in table_columns:
2101            if column not in header_columns:
2102                extra_columns.append(column)
2103
2104        return extra_columns

This function returns a list of columns that are in the table but not in the header

The function is called get_extra_infos and it takes two arguments: self and table. The self argument is a reference to the object that called the function. The table argument is the name of the table that we want to get the extra columns from

Parameters
  • table: The table to get the extra columns from. If not specified, it will use the variants table
  • format: The format of the output. If it's "sql", it will return a string of the extra columns separated by commas. If it's "list", it will return a list of the extra columns
Returns

A list of columns that are in the table but not in the header

def get_extra_infos_sql(self, table: str = None) -> str:
2106    def get_extra_infos_sql(self, table: str = None) -> str:
2107        """
2108        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2109        by double quotes
2110
2111        :param table: The name of the table to get the extra infos from. If None, the default table is
2112        used
2113        :type table: str
2114        :return: A string of the extra infos
2115        """
2116
2117        return ", ".join(
2118            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2119        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2121    def export_header(
2122        self,
2123        header_name: str = None,
2124        output_file: str = None,
2125        output_file_ext: str = ".hdr",
2126        clean_header: bool = True,
2127        remove_chrom_line: bool = False,
2128    ) -> str:
2129        """
2130        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2131        specified options, and writes it to a new file.
2132
2133        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2134        this parameter is not specified, the header will be written to the output file
2135        :type header_name: str
2136        :param output_file: The `output_file` parameter in the `export_header` function is used to
2137        specify the name of the output file where the header will be written. If this parameter is not
2138        provided, the header will be written to a temporary file
2139        :type output_file: str
2140        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2141        string that represents the extension of the output header file. By default, it is set to ".hdr"
2142        if not specified by the user. This extension will be appended to the `output_file` name to
2143        create the final, defaults to .hdr
2144        :type output_file_ext: str (optional)
2145        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2146        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2147        `True`, the function will clean the header by modifying certain lines based on a specific
2148        pattern. If `clean_header`, defaults to True
2149        :type clean_header: bool (optional)
2150        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2151        boolean flag that determines whether the #CHROM line should be removed from the header before
2152        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2153        defaults to False
2154        :type remove_chrom_line: bool (optional)
2155        :return: The function `export_header` returns the name of the temporary header file that is
2156        created.
2157        """
2158
2159        if not header_name and not output_file:
2160            output_file = self.get_output()
2161
2162        if self.get_header():
2163
2164            # Get header object
2165            header_obj = self.get_header()
2166
2167            # Create database
2168            db_for_header = Database(database=self.get_input())
2169
2170            # Get real columns in the file
2171            db_header_columns = db_for_header.get_columns()
2172
2173            with tempfile.TemporaryDirectory() as tmpdir:
2174
2175                # Write header file
2176                header_file_tmp = os.path.join(tmpdir, "header")
2177                f = open(header_file_tmp, "w")
2178                vcf.Writer(f, header_obj)
2179                f.close()
2180
2181                # Replace #CHROM line with rel columns
2182                header_list = db_for_header.read_header_file(
2183                    header_file=header_file_tmp
2184                )
2185                header_list[-1] = "\t".join(db_header_columns)
2186
2187                # Remove CHROM line
2188                if remove_chrom_line:
2189                    header_list.pop()
2190
2191                # Clean header
2192                if clean_header:
2193                    header_list_clean = []
2194                    for head in header_list:
2195                        # Clean head for malformed header
2196                        head_clean = head
2197                        head_clean = re.subn(
2198                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2199                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2200                            head_clean,
2201                            2,
2202                        )[0]
2203                        # Write header
2204                        header_list_clean.append(head_clean)
2205                    header_list = header_list_clean
2206
2207            tmp_header_name = output_file + output_file_ext
2208
2209            f = open(tmp_header_name, "w")
2210            for line in header_list:
2211                f.write(line)
2212            f.close()
2213
2214        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], index: bool = False, threads: int | None = None) -> bool | None:
2216    def export_variant_vcf(
2217        self,
2218        vcf_file,
2219        remove_info: bool = False,
2220        add_samples: bool = True,
2221        list_samples: list = [],
2222        index: bool = False,
2223        threads: int | None = None,
2224    ) -> bool | None:
2225        """
2226        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2227        remove INFO field, add samples, and control compression and indexing.
2228
2229        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2230        written to. It is the output file that will contain the filtered VCF data based on the specified
2231        parameters
2232        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2233        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2234        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2235        in, defaults to False
2236        :type remove_info: bool (optional)
2237        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2238        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2239        If set to False, the samples will be removed. The default value is True, defaults to True
2240        :type add_samples: bool (optional)
2241        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2242        in the output VCF file. By default, all samples will be included. If you provide a list of
2243        samples, only those samples will be included in the output file
2244        :type list_samples: list
2245        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2246        determines whether or not to create an index for the output VCF file. If `index` is set to
2247        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2248        :type index: bool (optional)
2249        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2250        number of threads to use for exporting the VCF file. It determines how many parallel threads
2251        will be used during the export process. More threads can potentially speed up the export process
2252        by utilizing multiple cores of the processor. If
2253        :type threads: int | None
2254        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2255        method with various parameters including the output file, query, threads, sort flag, and index
2256        flag. The `export_output` method is responsible for exporting the VCF data based on the
2257        specified parameters and configurations provided in the `export_variant_vcf` function.
2258        """
2259
2260        # Config
2261        config = self.get_config()
2262
2263        # Extract VCF
2264        log.debug("Export VCF...")
2265
2266        # Table variants
2267        table_variants = self.get_table_variants()
2268
2269        # Threads
2270        if not threads:
2271            threads = self.get_threads()
2272
2273        # Info fields
2274        if remove_info:
2275            if not isinstance(remove_info, str):
2276                remove_info = "."
2277            info_field = f"""'{remove_info}' as INFO"""
2278        else:
2279            info_field = "INFO"
2280
2281        # Samples fields
2282        if add_samples:
2283            if not list_samples:
2284                list_samples = self.get_header_sample_list()
2285            if list_samples:
2286                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2287            else:
2288                samples_fields = ""
2289            log.debug(f"samples_fields: {samples_fields}")
2290        else:
2291            samples_fields = ""
2292
2293        # Variants
2294        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2295        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} """
2296
2297        return self.export_output(
2298            output_file=vcf_file,
2299            output_header=None,
2300            export_header=True,
2301            query=sql_query_select,
2302            parquet_partitions=None,
2303            chunk_size=config.get("chunk_size", None),
2304            threads=threads,
2305            sort=True,
2306            index=index,
2307            order_by=None,
2308        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2310    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2311        """
2312        It takes a list of commands and runs them in parallel using the number of threads specified
2313
2314        :param commands: A list of commands to run
2315        :param threads: The number of threads to use, defaults to 1 (optional)
2316        """
2317
2318        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2320    def get_threads(self, default: int = 1) -> int:
2321        """
2322        This function returns the number of threads to use for a job, with a default value of 1 if not
2323        specified.
2324
2325        :param default: The `default` parameter in the `get_threads` method is used to specify the
2326        default number of threads to use if no specific value is provided. If no value is provided for
2327        the `threads` parameter in the configuration or input parameters, the `default` value will be
2328        used, defaults to 1
2329        :type default: int (optional)
2330        :return: the number of threads to use for the current job.
2331        """
2332
2333        # Config
2334        config = self.get_config()
2335
2336        # Param
2337        param = self.get_param()
2338
2339        # Input threads
2340        input_thread = param.get("threads", config.get("threads", None))
2341
2342        # Check threads
2343        if not input_thread:
2344            threads = default
2345        elif int(input_thread) <= 0:
2346            threads = os.cpu_count()
2347        else:
2348            threads = int(input_thread)
2349        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2351    def get_memory(self, default: str = None) -> str:
2352        """
2353        This function retrieves the memory value from parameters or configuration with a default value
2354        if not found.
2355
2356        :param default: The `get_memory` function takes in a default value as a string parameter. This
2357        default value is used as a fallback in case the `memory` parameter is not provided in the
2358        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2359        the function
2360        :type default: str
2361        :return: The `get_memory` function returns a string value representing the memory parameter. If
2362        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2363        return the default value provided as an argument to the function.
2364        """
2365
2366        # Config
2367        config = self.get_config()
2368
2369        # Param
2370        param = self.get_param()
2371
2372        # Input threads
2373        input_memory = param.get("memory", config.get("memory", None))
2374
2375        # Check threads
2376        if input_memory:
2377            memory = input_memory
2378        else:
2379            memory = default
2380
2381        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2383    def update_from_vcf(self, vcf_file: str) -> None:
2384        """
2385        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2386
2387        :param vcf_file: the path to the VCF file
2388        """
2389
2390        connexion_format = self.get_connexion_format()
2391
2392        if connexion_format in ["duckdb"]:
2393            self.update_from_vcf_duckdb(vcf_file)
2394        elif connexion_format in ["sqlite"]:
2395            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file (duckdb connexion), matching rows on #CHROM/POS/REF/ALT.

        Existing INFO content is preserved; the matching VCF record's INFO is
        appended, with a ';' separator when both sides are non-empty ('' and
        '.' count as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta/header lines.
        # NOTE(review): the SQL below references `vcf_df` by name; DuckDB is
        # expected to resolve it from this local DataFrame (replacement scan) —
        # confirm with the duckdb version in use
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the existing INFO; '' and '.' are
        # treated as empty on both sides to avoid stray ';' separators
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file (sqlite connexion).

        A temporary table is created with the same schema as the variants
        table, the VCF is loaded into it, the variants table's INFO is updated
        by matching rows on #CHROM/POS/REF/ALT, and the temporary table is
        dropped.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the variants schema (WHERE 0 = no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table
        # NOTE(review): assumes an 8-column VCF body (no FORMAT/sample columns)
        # — confirm against callers
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data: append the matching VCF INFO,
        # with a ';' separator when both sides are non-empty ('' and '.' count
        # as empty)
        # warning: CONCAT as || operator
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2511    def drop_variants_table(self) -> None:
2512        """
2513        > This function drops the variants table
2514        """
2515
2516        table_variants = self.get_table_variants()
2517        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2518        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2520    def set_variant_id(
2521        self, variant_id_column: str = "variant_id", force: bool = None
2522    ) -> str:
2523        """
2524        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2525        `#CHROM`, `POS`, `REF`, and `ALT` columns
2526
2527        :param variant_id_column: The name of the column to be created in the variants table, defaults
2528        to variant_id
2529        :type variant_id_column: str (optional)
2530        :param force: If True, the variant_id column will be created even if it already exists
2531        :type force: bool
2532        :return: The name of the column that contains the variant_id
2533        """
2534
2535        # Assembly
2536        assembly = self.get_param().get(
2537            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2538        )
2539
2540        # INFO/Tag prefix
2541        prefix = self.get_explode_infos_prefix()
2542
2543        # Explode INFO/SVTYPE
2544        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2545
2546        # variants table
2547        table_variants = self.get_table_variants()
2548
2549        # variant_id column
2550        if not variant_id_column:
2551            variant_id_column = "variant_id"
2552
2553        # Creta variant_id column
2554        if "variant_id" not in self.get_extra_infos() or force:
2555
2556            # Create column
2557            self.add_column(
2558                table_name=table_variants,
2559                column_name=variant_id_column,
2560                column_type="UBIGINT",
2561                default_value="0",
2562            )
2563
2564            # Update column
2565            self.conn.execute(
2566                f"""
2567                    UPDATE {table_variants}
2568                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2569                """
2570            )
2571
2572        # Remove added columns
2573        for added_column in added_columns:
2574            self.drop_column(column=added_column)
2575
2576        # return variant_id column name
2577        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2579    def get_variant_id_column(
2580        self, variant_id_column: str = "variant_id", force: bool = None
2581    ) -> str:
2582        """
2583        This function returns the variant_id column name
2584
2585        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2586        defaults to variant_id
2587        :type variant_id_column: str (optional)
2588        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2589        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2590        if it is not already set, or if it is set
2591        :type force: bool
2592        :return: The variant_id column name.
2593        """
2594
2595        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id column to be (re)created and populated. If False or None, the variant_id is only set when it is not already set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list['parquet'], database_releases: list = ['current']) -> dict:
2601    def scan_databases(
2602        self, database_formats: list["parquet"], database_releases: list = ["current"]
2603    ) -> dict:
2604        """
2605        The function `scan_databases` scans for available databases based on specified formats and
2606        releases.
2607
2608        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2609        of the databases to be scanned. In this case, the accepted format is "parquet"
2610        :type database_formats: list ["parquet"]
2611        :param database_releases: The `database_releases` parameter is a list that specifies the
2612        releases of the databases to be scanned. In the provided function, the default value for
2613        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2614        databases that are in the "current"
2615        :type database_releases: list
2616        :return: The function `scan_databases` returns a dictionary containing information about
2617        databases that match the specified formats and releases.
2618        """
2619
2620        # Config
2621        config = self.get_config()
2622
2623        # Param
2624        param = self.get_param()
2625
2626        # Param - Assembly
2627        assembly = param.get("assembly", config.get("assembly", None))
2628        if not assembly:
2629            assembly = DEFAULT_ASSEMBLY
2630            log.warning(f"Default assembly '{assembly}'")
2631
2632        # Scan for availabled databases
2633        log.info(
2634            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2635        )
2636        databases_infos_dict = databases_infos(
2637            database_folder_releases=database_releases,
2638            database_formats=database_formats,
2639            assembly=assembly,
2640            config=config,
2641        )
2642        log.info(
2643            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2644        )
2645
2646        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. The default value is ["current"], meaning that by default the function scans databases in the "current" release.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Processing steps:
        1. Collect quick-annotation shortcuts ("annotations" string plus the
           per-tool "annotation_*" params) into one comma-separated list.
        2. Resolve each entry (ALL expansion, tool prefixes, database file
           lookup) into the structured param["annotation"][tool]["annotations"]
           mapping.
        3. Dispatch to the per-tool annotation methods (parquet, bcftools,
           snpsift, annovar, snpeff, exomiser, splice).
        4. Optionally explode INFO fields into table columns.

        Mutates self's param dict via set_param; returns nothing.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config; fall back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Annotations databases folders: union of the configured "annotations",
        # "parquet" and "bcftools" database folders (deduplicated via set)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (only a comma-separated string is split here)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Fold each per-tool shortcut param ("annotation_<tool>") into the
        # quick-annotation list, prefixing with the tool name where needed.
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # snpsift entries use "+" as the internal file separator
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            # bcftools entries also use "+" as the internal file separator
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into a single comma-separated string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each entry to its requested
            # INFO fields ({"INFO": None} means "all fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode the special "ALL[:format=...][:release=...]" entry
                # into every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases); "+" separates
                    # multiple values within one option
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each database entry and route it to the right tool section
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" is treated
                    # as snpEff command-line options (overwrites any existing)
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token is an
                    # Annovar annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: whole entry parsed as key=value dict
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: whole entry parsed as key=value dict
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS (file-based databases)
                    else:

                        # Tools detection from an explicit prefix; None means
                        # "auto-detect from the database format" below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both act as separators)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        # NOTE: annotation_file is deliberately rebound by this
                        # inner loop; the outer loop variable is not used again
                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    # Only searched when no assembly is set —
                                    # assembly always has a default above, so
                                    # this branch appears unreachable in
                                    # practice (TODO confirm intent)
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    # Inspect the database to pick a tool
                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # Hard-coded off: bcftools is never
                                    # auto-preferred, so auto-detection always
                                    # falls through to "parquet" for supported
                                    # formats
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # resolved file under its tool section
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    # Missing file is logged but does not abort
                                    # the remaining annotations
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Run each configured annotation tool in a fixed order
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            # exomiser/splice use "is not None" so an empty dict still triggers
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3020    def annotation_snpsift(self, threads: int = None) -> None:
3021        """
3022        This function annotate with bcftools
3023
3024        :param threads: Number of threads to use
3025        :return: the value of the variable "return_value".
3026        """
3027
3028        # DEBUG
3029        log.debug("Start annotation with bcftools databases")
3030
3031        # Threads
3032        if not threads:
3033            threads = self.get_threads()
3034        log.debug("Threads: " + str(threads))
3035
3036        # Config
3037        config = self.get_config()
3038        log.debug("Config: " + str(config))
3039
3040        # Config - snpSift
3041        snpsift_bin_command = get_bin_command(
3042            bin="SnpSift.jar",
3043            tool="snpsift",
3044            bin_type="jar",
3045            config=config,
3046            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3047        )
3048        if not snpsift_bin_command:
3049            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3050            log.error(msg_err)
3051            raise ValueError(msg_err)
3052
3053        # Config - bcftools
3054        bcftools_bin_command = get_bin_command(
3055            bin="bcftools",
3056            tool="bcftools",
3057            bin_type="bin",
3058            config=config,
3059            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3060        )
3061        if not bcftools_bin_command:
3062            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3063            log.error(msg_err)
3064            raise ValueError(msg_err)
3065
3066        # Config - BCFTools databases folders
3067        databases_folders = set(
3068            self.get_config()
3069            .get("folders", {})
3070            .get("databases", {})
3071            .get("annotations", ["."])
3072            + self.get_config()
3073            .get("folders", {})
3074            .get("databases", {})
3075            .get("bcftools", ["."])
3076        )
3077        log.debug("Databases annotations: " + str(databases_folders))
3078
3079        # Param
3080        annotations = (
3081            self.get_param()
3082            .get("annotation", {})
3083            .get("snpsift", {})
3084            .get("annotations", None)
3085        )
3086        log.debug("Annotations: " + str(annotations))
3087
3088        # Assembly
3089        assembly = self.get_param().get(
3090            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3091        )
3092
3093        # Data
3094        table_variants = self.get_table_variants()
3095
3096        # Check if not empty
3097        log.debug("Check if not empty")
3098        sql_query_chromosomes = (
3099            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3100        )
3101        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3102        if not sql_query_chromosomes_df["count"][0]:
3103            log.info(f"VCF empty")
3104            return
3105
3106        # VCF header
3107        vcf_reader = self.get_header()
3108        log.debug("Initial header: " + str(vcf_reader.infos))
3109
3110        # Existing annotations
3111        for vcf_annotation in self.get_header().infos:
3112
3113            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3114            log.debug(
3115                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3116            )
3117
3118        if annotations:
3119
3120            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3121
3122                # Export VCF file
3123                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3124
3125                # Init
3126                commands = {}
3127
3128                for annotation in annotations:
3129                    annotation_fields = annotations[annotation]
3130
3131                    # Annotation Name
3132                    annotation_name = os.path.basename(annotation)
3133
3134                    if not annotation_fields:
3135                        annotation_fields = {"INFO": None}
3136
3137                    log.debug(f"Annotation '{annotation_name}'")
3138                    log.debug(
3139                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3140                    )
3141
3142                    # Create Database
3143                    database = Database(
3144                        database=annotation,
3145                        databases_folders=databases_folders,
3146                        assembly=assembly,
3147                    )
3148
3149                    # Find files
3150                    db_file = database.get_database()
3151                    db_file = full_path(db_file)
3152                    db_hdr_file = database.get_header_file()
3153                    db_hdr_file = full_path(db_hdr_file)
3154                    db_file_type = database.get_format()
3155                    db_tbi_file = f"{db_file}.tbi"
3156                    db_file_compressed = database.is_compressed()
3157
3158                    # Check if compressed
3159                    if not db_file_compressed:
3160                        log.error(
3161                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3162                        )
3163                        raise ValueError(
3164                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3165                        )
3166
3167                    # Check if indexed
3168                    if not os.path.exists(db_tbi_file):
3169                        log.error(
3170                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3171                        )
3172                        raise ValueError(
3173                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3174                        )
3175
3176                    # Check index - try to create if not exists
3177                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3178                        log.error("Annotation failed: database not valid")
3179                        log.error(f"Annotation annotation file: {db_file}")
3180                        log.error(f"Annotation annotation header: {db_hdr_file}")
3181                        log.error(f"Annotation annotation index: {db_tbi_file}")
3182                        raise ValueError(
3183                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3184                        )
3185                    else:
3186
3187                        log.debug(
3188                            f"Annotation '{annotation}' - file: "
3189                            + str(db_file)
3190                            + " and "
3191                            + str(db_hdr_file)
3192                        )
3193
3194                        # Load header as VCF object
3195                        db_hdr_vcf = Variants(input=db_hdr_file)
3196                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3197                        log.debug(
3198                            "Annotation database header: "
3199                            + str(db_hdr_vcf_header_infos)
3200                        )
3201
3202                        # For all fields in database
3203                        annotation_fields_full = False
3204                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3205                            annotation_fields = {
3206                                key: key for key in db_hdr_vcf_header_infos
3207                            }
3208                            log.debug(
3209                                "Annotation database header - All annotations added: "
3210                                + str(annotation_fields)
3211                            )
3212                            annotation_fields_full = True
3213
3214                        # # Create file for field rename
3215                        # log.debug("Create file for field rename")
3216                        # tmp_rename = NamedTemporaryFile(
3217                        #     prefix=self.get_prefix(),
3218                        #     dir=self.get_tmp_dir(),
3219                        #     suffix=".rename",
3220                        #     delete=False,
3221                        # )
3222                        # tmp_rename_name = tmp_rename.name
3223                        # tmp_files.append(tmp_rename_name)
3224
3225                        # Number of fields
3226                        nb_annotation_field = 0
3227                        annotation_list = []
3228                        annotation_infos_rename_list = []
3229
3230                        for annotation_field in annotation_fields:
3231
3232                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3233                            annotation_fields_new_name = annotation_fields.get(
3234                                annotation_field, annotation_field
3235                            )
3236                            if not annotation_fields_new_name:
3237                                annotation_fields_new_name = annotation_field
3238
3239                            # Check if field is in DB and if field is not elready in input data
3240                            if (
3241                                annotation_field in db_hdr_vcf.get_header().infos
3242                                and annotation_fields_new_name
3243                                not in self.get_header().infos
3244                            ):
3245
3246                                log.info(
3247                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3248                                )
3249
3250                                # BCFTools annotate param to rename fields
3251                                if annotation_field != annotation_fields_new_name:
3252                                    annotation_infos_rename_list.append(
3253                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3254                                    )
3255
3256                                # Add INFO field to header
3257                                db_hdr_vcf_header_infos_number = (
3258                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3259                                )
3260                                db_hdr_vcf_header_infos_type = (
3261                                    db_hdr_vcf_header_infos[annotation_field].type
3262                                    or "String"
3263                                )
3264                                db_hdr_vcf_header_infos_description = (
3265                                    db_hdr_vcf_header_infos[annotation_field].desc
3266                                    or f"{annotation_field} description"
3267                                )
3268                                db_hdr_vcf_header_infos_source = (
3269                                    db_hdr_vcf_header_infos[annotation_field].source
3270                                    or "unknown"
3271                                )
3272                                db_hdr_vcf_header_infos_version = (
3273                                    db_hdr_vcf_header_infos[annotation_field].version
3274                                    or "unknown"
3275                                )
3276
3277                                vcf_reader.infos[annotation_fields_new_name] = (
3278                                    vcf.parser._Info(
3279                                        annotation_fields_new_name,
3280                                        db_hdr_vcf_header_infos_number,
3281                                        db_hdr_vcf_header_infos_type,
3282                                        db_hdr_vcf_header_infos_description,
3283                                        db_hdr_vcf_header_infos_source,
3284                                        db_hdr_vcf_header_infos_version,
3285                                        self.code_type_map[
3286                                            db_hdr_vcf_header_infos_type
3287                                        ],
3288                                    )
3289                                )
3290
3291                                annotation_list.append(annotation_field)
3292
3293                                nb_annotation_field += 1
3294
3295                            else:
3296
3297                                if (
3298                                    annotation_field
3299                                    not in db_hdr_vcf.get_header().infos
3300                                ):
3301                                    log.warning(
3302                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3303                                    )
3304                                if (
3305                                    annotation_fields_new_name
3306                                    in self.get_header().infos
3307                                ):
3308                                    log.warning(
3309                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3310                                    )
3311
3312                        log.info(
3313                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3314                        )
3315
3316                        annotation_infos = ",".join(annotation_list)
3317
3318                        if annotation_infos != "":
3319
3320                            # Annotated VCF (and error file)
3321                            tmp_annotation_vcf_name = os.path.join(
3322                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3323                            )
3324                            tmp_annotation_vcf_name_err = (
3325                                tmp_annotation_vcf_name + ".err"
3326                            )
3327
3328                            # Add fields to annotate
3329                            if not annotation_fields_full:
3330                                annotation_infos_option = f"-info {annotation_infos}"
3331                            else:
3332                                annotation_infos_option = ""
3333
3334                            # Info fields rename
3335                            if annotation_infos_rename_list:
3336                                annotation_infos_rename = " -c " + ",".join(
3337                                    annotation_infos_rename_list
3338                                )
3339                            else:
3340                                annotation_infos_rename = ""
3341
3342                            # Annotate command
3343                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3344
3345                            # Add command
3346                            commands[command_annotate] = tmp_annotation_vcf_name
3347
3348                if commands:
3349
3350                    # Export VCF file
3351                    self.export_variant_vcf(
3352                        vcf_file=tmp_vcf_name,
3353                        remove_info=True,
3354                        add_samples=False,
3355                        index=True,
3356                    )
3357                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3358
3359                    # Num command
3360                    nb_command = 0
3361
3362                    # Annotate
3363                    for command_annotate in commands:
3364                        nb_command += 1
3365                        log.info(
3366                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3367                        )
3368                        log.debug(f"command_annotate={command_annotate}")
3369                        run_parallel_commands([command_annotate], threads)
3370
3371                        # Debug
3372                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3373
3374                        # Update variants
3375                        log.info(
3376                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3377                        )
3378                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
3380    def annotation_bcftools(self, threads: int = None) -> None:
3381        """
3382        This function annotate with bcftools
3383
3384        :param threads: Number of threads to use
3385        :return: the value of the variable "return_value".
3386        """
3387
3388        # DEBUG
3389        log.debug("Start annotation with bcftools databases")
3390
3391        # Threads
3392        if not threads:
3393            threads = self.get_threads()
3394        log.debug("Threads: " + str(threads))
3395
3396        # Config
3397        config = self.get_config()
3398        log.debug("Config: " + str(config))
3399
3400        # DEBUG
3401        delete_tmp = True
3402        if self.get_config().get("verbosity", "warning") in ["debug"]:
3403            delete_tmp = False
3404            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3405
3406        # Config - BCFTools bin command
3407        bcftools_bin_command = get_bin_command(
3408            bin="bcftools",
3409            tool="bcftools",
3410            bin_type="bin",
3411            config=config,
3412            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3413        )
3414        if not bcftools_bin_command:
3415            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3416            log.error(msg_err)
3417            raise ValueError(msg_err)
3418
3419        # Config - BCFTools databases folders
3420        databases_folders = set(
3421            self.get_config()
3422            .get("folders", {})
3423            .get("databases", {})
3424            .get("annotations", ["."])
3425            + self.get_config()
3426            .get("folders", {})
3427            .get("databases", {})
3428            .get("bcftools", ["."])
3429        )
3430        log.debug("Databases annotations: " + str(databases_folders))
3431
3432        # Param
3433        annotations = (
3434            self.get_param()
3435            .get("annotation", {})
3436            .get("bcftools", {})
3437            .get("annotations", None)
3438        )
3439        log.debug("Annotations: " + str(annotations))
3440
3441        # Assembly
3442        assembly = self.get_param().get(
3443            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3444        )
3445
3446        # Data
3447        table_variants = self.get_table_variants()
3448
3449        # Check if not empty
3450        log.debug("Check if not empty")
3451        sql_query_chromosomes = (
3452            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3453        )
3454        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3455        if not sql_query_chromosomes_df["count"][0]:
3456            log.info(f"VCF empty")
3457            return
3458
3459        # Export in VCF
3460        log.debug("Create initial file to annotate")
3461        tmp_vcf = NamedTemporaryFile(
3462            prefix=self.get_prefix(),
3463            dir=self.get_tmp_dir(),
3464            suffix=".vcf.gz",
3465            delete=False,
3466        )
3467        tmp_vcf_name = tmp_vcf.name
3468
3469        # VCF header
3470        vcf_reader = self.get_header()
3471        log.debug("Initial header: " + str(vcf_reader.infos))
3472
3473        # Existing annotations
3474        for vcf_annotation in self.get_header().infos:
3475
3476            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3477            log.debug(
3478                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3479            )
3480
3481        if annotations:
3482
3483            tmp_ann_vcf_list = []
3484            commands = []
3485            tmp_files = []
3486            err_files = []
3487
3488            for annotation in annotations:
3489                annotation_fields = annotations[annotation]
3490
3491                # Annotation Name
3492                annotation_name = os.path.basename(annotation)
3493
3494                if not annotation_fields:
3495                    annotation_fields = {"INFO": None}
3496
3497                log.debug(f"Annotation '{annotation_name}'")
3498                log.debug(
3499                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3500                )
3501
3502                # Create Database
3503                database = Database(
3504                    database=annotation,
3505                    databases_folders=databases_folders,
3506                    assembly=assembly,
3507                )
3508
3509                # Find files
3510                db_file = database.get_database()
3511                db_file = full_path(db_file)
3512                db_hdr_file = database.get_header_file()
3513                db_hdr_file = full_path(db_hdr_file)
3514                db_file_type = database.get_format()
3515                db_tbi_file = f"{db_file}.tbi"
3516                db_file_compressed = database.is_compressed()
3517
3518                # Check if compressed
3519                if not db_file_compressed:
3520                    log.error(
3521                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3522                    )
3523                    raise ValueError(
3524                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3525                    )
3526
3527                # Check if indexed
3528                if not os.path.exists(db_tbi_file):
3529                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3530                    raise ValueError(
3531                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3532                    )
3533
3534                # Check index - try to create if not exists
3535                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3536                    log.error("Annotation failed: database not valid")
3537                    log.error(f"Annotation annotation file: {db_file}")
3538                    log.error(f"Annotation annotation header: {db_hdr_file}")
3539                    log.error(f"Annotation annotation index: {db_tbi_file}")
3540                    raise ValueError(
3541                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3542                    )
3543                else:
3544
3545                    log.debug(
3546                        f"Annotation '{annotation}' - file: "
3547                        + str(db_file)
3548                        + " and "
3549                        + str(db_hdr_file)
3550                    )
3551
3552                    # Load header as VCF object
3553                    db_hdr_vcf = Variants(input=db_hdr_file)
3554                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3555                    log.debug(
3556                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3557                    )
3558
3559                    # For all fields in database
3560                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3561                        annotation_fields = {
3562                            key: key for key in db_hdr_vcf_header_infos
3563                        }
3564                        log.debug(
3565                            "Annotation database header - All annotations added: "
3566                            + str(annotation_fields)
3567                        )
3568
3569                    # Number of fields
3570                    nb_annotation_field = 0
3571                    annotation_list = []
3572
3573                    for annotation_field in annotation_fields:
3574
3575                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3576                        annotation_fields_new_name = annotation_fields.get(
3577                            annotation_field, annotation_field
3578                        )
3579                        if not annotation_fields_new_name:
3580                            annotation_fields_new_name = annotation_field
3581
3582                        # Check if field is in DB and if field is not elready in input data
3583                        if (
3584                            annotation_field in db_hdr_vcf.get_header().infos
3585                            and annotation_fields_new_name
3586                            not in self.get_header().infos
3587                        ):
3588
3589                            log.info(
3590                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3591                            )
3592
3593                            # Add INFO field to header
3594                            db_hdr_vcf_header_infos_number = (
3595                                db_hdr_vcf_header_infos[annotation_field].num or "."
3596                            )
3597                            db_hdr_vcf_header_infos_type = (
3598                                db_hdr_vcf_header_infos[annotation_field].type
3599                                or "String"
3600                            )
3601                            db_hdr_vcf_header_infos_description = (
3602                                db_hdr_vcf_header_infos[annotation_field].desc
3603                                or f"{annotation_field} description"
3604                            )
3605                            db_hdr_vcf_header_infos_source = (
3606                                db_hdr_vcf_header_infos[annotation_field].source
3607                                or "unknown"
3608                            )
3609                            db_hdr_vcf_header_infos_version = (
3610                                db_hdr_vcf_header_infos[annotation_field].version
3611                                or "unknown"
3612                            )
3613
3614                            vcf_reader.infos[annotation_fields_new_name] = (
3615                                vcf.parser._Info(
3616                                    annotation_fields_new_name,
3617                                    db_hdr_vcf_header_infos_number,
3618                                    db_hdr_vcf_header_infos_type,
3619                                    db_hdr_vcf_header_infos_description,
3620                                    db_hdr_vcf_header_infos_source,
3621                                    db_hdr_vcf_header_infos_version,
3622                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3623                                )
3624                            )
3625
3626                            # annotation_list.append(annotation_field)
3627                            if annotation_field != annotation_fields_new_name:
3628                                annotation_list.append(
3629                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3630                                )
3631                            else:
3632                                annotation_list.append(annotation_field)
3633
3634                            nb_annotation_field += 1
3635
3636                        else:
3637
3638                            if annotation_field not in db_hdr_vcf.get_header().infos:
3639                                log.warning(
3640                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3641                                )
3642                            if annotation_fields_new_name in self.get_header().infos:
3643                                log.warning(
3644                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3645                                )
3646
3647                    log.info(
3648                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3649                    )
3650
3651                    annotation_infos = ",".join(annotation_list)
3652
3653                    if annotation_infos != "":
3654
3655                        # Protect header for bcftools (remove "#CHROM" and variants line)
3656                        log.debug("Protect Header file - remove #CHROM line if exists")
3657                        tmp_header_vcf = NamedTemporaryFile(
3658                            prefix=self.get_prefix(),
3659                            dir=self.get_tmp_dir(),
3660                            suffix=".hdr",
3661                            delete=False,
3662                        )
3663                        tmp_header_vcf_name = tmp_header_vcf.name
3664                        tmp_files.append(tmp_header_vcf_name)
3665                        # Command
3666                        if db_hdr_file.endswith(".gz"):
3667                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3668                        else:
3669                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3670                        # Run
3671                        run_parallel_commands([command_extract_header], 1)
3672
3673                        # Find chomosomes
3674                        log.debug("Find chromosomes ")
3675                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3676                        sql_query_chromosomes_df = self.get_query_to_df(
3677                            sql_query_chromosomes
3678                        )
3679                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3680
3681                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3682
3683                        # BED columns in the annotation file
3684                        if db_file_type in ["bed"]:
3685                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3686
3687                        for chrom in chomosomes_list:
3688
3689                            # Create BED on initial VCF
3690                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3691                            tmp_bed = NamedTemporaryFile(
3692                                prefix=self.get_prefix(),
3693                                dir=self.get_tmp_dir(),
3694                                suffix=".bed",
3695                                delete=False,
3696                            )
3697                            tmp_bed_name = tmp_bed.name
3698                            tmp_files.append(tmp_bed_name)
3699
3700                            # Detecte regions
3701                            log.debug(
3702                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3703                            )
3704                            window = 1000000
3705                            sql_query_intervals_for_bed = f"""
3706                                SELECT  \"#CHROM\",
3707                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3708                                        \"POS\"+{window}
3709                                FROM {table_variants} as table_variants
3710                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3711                            """
3712                            regions = self.conn.execute(
3713                                sql_query_intervals_for_bed
3714                            ).fetchall()
3715                            merged_regions = merge_regions(regions)
3716                            log.debug(
3717                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3718                            )
3719
3720                            header = ["#CHROM", "START", "END"]
3721                            with open(tmp_bed_name, "w") as f:
3722                                # Write the header with tab delimiter
3723                                f.write("\t".join(header) + "\n")
3724                                for d in merged_regions:
3725                                    # Write each data row with tab delimiter
3726                                    f.write("\t".join(map(str, d)) + "\n")
3727
3728                            # Tmp files
3729                            tmp_annotation_vcf = NamedTemporaryFile(
3730                                prefix=self.get_prefix(),
3731                                dir=self.get_tmp_dir(),
3732                                suffix=".vcf.gz",
3733                                delete=False,
3734                            )
3735                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3736                            tmp_files.append(tmp_annotation_vcf_name)
3737                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3738                            tmp_annotation_vcf_name_err = (
3739                                tmp_annotation_vcf_name + ".err"
3740                            )
3741                            err_files.append(tmp_annotation_vcf_name_err)
3742
3743                            # Annotate Command
3744                            log.debug(
3745                                f"Annotation '{annotation}' - add bcftools command"
3746                            )
3747
3748                            # Command
3749                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3750
3751                            # Add command
3752                            commands.append(command_annotate)
3753
3754            # if some commands
3755            if commands:
3756
3757                # Export VCF file
3758                self.export_variant_vcf(
3759                    vcf_file=tmp_vcf_name,
3760                    remove_info=True,
3761                    add_samples=False,
3762                    index=True,
3763                )
3764
3765                # Threads
3766                # calculate threads for annotated commands
3767                if commands:
3768                    threads_bcftools_annotate = round(threads / len(commands))
3769                else:
3770                    threads_bcftools_annotate = 1
3771
3772                if not threads_bcftools_annotate:
3773                    threads_bcftools_annotate = 1
3774
3775                # Add threads option to bcftools commands
3776                if threads_bcftools_annotate > 1:
3777                    commands_threaded = []
3778                    for command in commands:
3779                        commands_threaded.append(
3780                            command.replace(
3781                                f"{bcftools_bin_command} annotate ",
3782                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3783                            )
3784                        )
3785                    commands = commands_threaded
3786
3787                # Command annotation multithreading
3788                log.debug(f"Annotation - Annotation commands: " + str(commands))
3789                log.info(
3790                    f"Annotation - Annotation multithreaded in "
3791                    + str(len(commands))
3792                    + " commands"
3793                )
3794
3795                run_parallel_commands(commands, threads)
3796
3797                # Merge
3798                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3799
3800                if tmp_ann_vcf_list_cmd:
3801
3802                    # Tmp file
3803                    tmp_annotate_vcf = NamedTemporaryFile(
3804                        prefix=self.get_prefix(),
3805                        dir=self.get_tmp_dir(),
3806                        suffix=".vcf.gz",
3807                        delete=True,
3808                    )
3809                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3810                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3811                    err_files.append(tmp_annotate_vcf_name_err)
3812
3813                    # Tmp file remove command
3814                    tmp_files_remove_command = ""
3815                    if tmp_files:
3816                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3817
3818                    # Command merge
3819                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3820                    log.info(
3821                        f"Annotation - Annotation merging "
3822                        + str(len(commands))
3823                        + " annotated files"
3824                    )
3825                    log.debug(f"Annotation - merge command: {merge_command}")
3826                    run_parallel_commands([merge_command], 1)
3827
3828                    # Error messages
3829                    log.info(f"Error/Warning messages:")
3830                    error_message_command_all = []
3831                    error_message_command_warning = []
3832                    error_message_command_err = []
3833                    for err_file in err_files:
3834                        with open(err_file, "r") as f:
3835                            for line in f:
3836                                message = line.strip()
3837                                error_message_command_all.append(message)
3838                                if line.startswith("[W::"):
3839                                    error_message_command_warning.append(message)
3840                                if line.startswith("[E::"):
3841                                    error_message_command_err.append(
3842                                        f"{err_file}: " + message
3843                                    )
3844                    # log info
3845                    for message in list(
3846                        set(error_message_command_err + error_message_command_warning)
3847                    ):
3848                        log.info(f"   {message}")
3849                    # debug info
3850                    for message in list(set(error_message_command_all)):
3851                        log.debug(f"   {message}")
3852                    # failed
3853                    if len(error_message_command_err):
3854                        log.error("Annotation failed: Error in commands")
3855                        raise ValueError("Annotation failed: Error in commands")
3856
3857                    # Update variants
3858                    log.info(f"Annotation - Updating...")
3859                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser.

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_exomiser(self, threads: int = None) -> None:
3861    def annotation_exomiser(self, threads: int = None) -> None:
3862        """
3863        This function annotate with Exomiser
3864
3865        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3866        - "analysis" (dict/file):
3867            Full analysis dictionnary parameters (see Exomiser docs).
3868            Either a dict, or a file in JSON or YAML format.
3869            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3870            Default : None
3871        - "preset" (string):
3872            Analysis preset (available in config folder).
3873            Used if no full "analysis" is provided.
3874            Default: "exome"
3875        - "phenopacket" (dict/file):
3876            Samples and phenotipic features parameters (see Exomiser docs).
3877            Either a dict, or a file in JSON or YAML format.
3878            Default: None
3879        - "subject" (dict):
3880            Sample parameters (see Exomiser docs).
3881            Example:
3882                "subject":
3883                    {
3884                        "id": "ISDBM322017",
3885                        "sex": "FEMALE"
3886                    }
3887            Default: None
3888        - "sample" (string):
3889            Sample name to construct "subject" section:
3890                "subject":
3891                    {
3892                        "id": "<sample>",
3893                        "sex": "UNKNOWN_SEX"
3894                    }
3895            Default: None
3896        - "phenotypicFeatures" (dict)
3897            Phenotypic features to construct "subject" section.
3898            Example:
3899                "phenotypicFeatures":
3900                    [
3901                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3902                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3903                    ]
3904        - "hpo" (list)
3905            List of HPO ids as phenotypic features.
3906            Example:
3907                "hpo": ['0001156', '0001363', '0011304', '0010055']
3908            Default: []
3909        - "outputOptions" (dict):
3910            Output options (see Exomiser docs).
3911            Default:
3912                "output_options" =
3913                    {
3914                        "outputContributingVariantsOnly": False,
3915                        "numGenes": 0,
3916                        "outputFormats": ["TSV_VARIANT", "VCF"]
3917                    }
3918        - "transcript_source" (string):
3919            Transcript source (either "refseq", "ucsc", "ensembl")
3920            Default: "refseq"
3921        - "exomiser_to_info" (boolean):
3922            Add exomiser TSV file columns as INFO fields in VCF.
3923            Default: False
3924        - "release" (string):
3925            Exomise database release.
3926            If not exists, database release will be downloaded (take a while).
3927            Default: None (provided by application.properties configuration file)
3928        - "exomiser_application_properties" (file):
3929            Exomiser configuration file (see Exomiser docs).
3930            Useful to automatically download databases (especially for specific genome databases).
3931
3932        Notes:
3933        - If no sample in parameters, first sample in VCF will be chosen
3934        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
3935
3936        :param threads: The number of threads to use
3937        :return: None.
3938        """
3939
3940        # DEBUG
3941        log.debug("Start annotation with Exomiser databases")
3942
3943        # Threads
3944        if not threads:
3945            threads = self.get_threads()
3946        log.debug("Threads: " + str(threads))
3947
3948        # Config
3949        config = self.get_config()
3950        log.debug("Config: " + str(config))
3951
3952        # Config - Folders - Databases
3953        databases_folders = (
3954            config.get("folders", {})
3955            .get("databases", {})
3956            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
3957        )
3958        databases_folders = full_path(databases_folders)
3959        if not os.path.exists(databases_folders):
3960            log.error(f"Databases annotations: {databases_folders} NOT found")
3961        log.debug("Databases annotations: " + str(databases_folders))
3962
3963        # Config - Exomiser
3964        exomiser_bin_command = get_bin_command(
3965            bin="exomiser-cli*.jar",
3966            tool="exomiser",
3967            bin_type="jar",
3968            config=config,
3969            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
3970        )
3971        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
3972        if not exomiser_bin_command:
3973            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
3974            log.error(msg_err)
3975            raise ValueError(msg_err)
3976
3977        # Param
3978        param = self.get_param()
3979        log.debug("Param: " + str(param))
3980
3981        # Param - Exomiser
3982        param_exomiser = param.get("annotation", {}).get("exomiser", {})
3983        log.debug(f"Param Exomiser: {param_exomiser}")
3984
3985        # Param - Assembly
3986        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
3987        log.debug("Assembly: " + str(assembly))
3988
3989        # Data
3990        table_variants = self.get_table_variants()
3991
3992        # Check if not empty
3993        log.debug("Check if not empty")
3994        sql_query_chromosomes = (
3995            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3996        )
3997        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
3998            log.info(f"VCF empty")
3999            return False
4000
4001        # VCF header
4002        vcf_reader = self.get_header()
4003        log.debug("Initial header: " + str(vcf_reader.infos))
4004
4005        # Samples
4006        samples = self.get_header_sample_list()
4007        if not samples:
4008            log.error("No Samples in VCF")
4009            return False
4010        log.debug(f"Samples: {samples}")
4011
4012        # Memory limit
4013        memory_limit = self.get_memory("8G")
4014        log.debug(f"memory_limit: {memory_limit}")
4015
4016        # Exomiser java options
4017        exomiser_java_options = (
4018            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4019        )
4020        log.debug(f"Exomiser java options: {exomiser_java_options}")
4021
4022        # Download Exomiser (if not exists)
4023        exomiser_release = param_exomiser.get("release", None)
4024        exomiser_application_properties = param_exomiser.get(
4025            "exomiser_application_properties", None
4026        )
4027        databases_download_exomiser(
4028            assemblies=[assembly],
4029            exomiser_folder=databases_folders,
4030            exomiser_release=exomiser_release,
4031            exomiser_phenotype_release=exomiser_release,
4032            exomiser_application_properties=exomiser_application_properties,
4033        )
4034
4035        # Force annotation
4036        force_update_annotation = True
4037
4038        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4039            log.debug("Start annotation Exomiser")
4040
4041            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4042
4043                # tmp_dir = "/tmp/exomiser"
4044
4045                ### ANALYSIS ###
4046                ################
4047
4048                # Create analysis.json through analysis dict
4049                # either analysis in param or by default
4050                # depending on preset exome/genome)
4051
4052                # Init analysis dict
4053                param_exomiser_analysis_dict = {}
4054
4055                # analysis from param
4056                param_exomiser_analysis = param_exomiser.get("analysis", {})
4057                param_exomiser_analysis = full_path(param_exomiser_analysis)
4058
4059                # If analysis in param -> load anlaysis json
4060                if param_exomiser_analysis:
4061
4062                    # If param analysis is a file and exists
4063                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4064                        param_exomiser_analysis
4065                    ):
4066                        # Load analysis file into analysis dict (either yaml or json)
4067                        with open(param_exomiser_analysis) as json_file:
4068                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4069
4070                    # If param analysis is a dict
4071                    elif isinstance(param_exomiser_analysis, dict):
4072                        # Load analysis dict into analysis dict (either yaml or json)
4073                        param_exomiser_analysis_dict = param_exomiser_analysis
4074
4075                    # Error analysis type
4076                    else:
4077                        log.error(f"Analysis type unknown. Check param file.")
4078                        raise ValueError(f"Analysis type unknown. Check param file.")
4079
4080                # Case no input analysis config file/dict
4081                # Use preset (exome/genome) to open default config file
4082                if not param_exomiser_analysis_dict:
4083
4084                    # default preset
4085                    default_preset = "exome"
4086
4087                    # Get param preset or default preset
4088                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4089
4090                    # Try to find if preset is a file
4091                    if os.path.exists(param_exomiser_preset):
4092                        # Preset file is provided in full path
4093                        param_exomiser_analysis_default_config_file = (
4094                            param_exomiser_preset
4095                        )
4096                    # elif os.path.exists(full_path(param_exomiser_preset)):
4097                    #     # Preset file is provided in full path
4098                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4099                    elif os.path.exists(
4100                        os.path.join(folder_config, param_exomiser_preset)
4101                    ):
4102                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4103                        param_exomiser_analysis_default_config_file = os.path.join(
4104                            folder_config, param_exomiser_preset
4105                        )
4106                    else:
4107                        # Construct preset file
4108                        param_exomiser_analysis_default_config_file = os.path.join(
4109                            folder_config,
4110                            f"preset-{param_exomiser_preset}-analysis.json",
4111                        )
4112
4113                    # If preset file exists
4114                    param_exomiser_analysis_default_config_file = full_path(
4115                        param_exomiser_analysis_default_config_file
4116                    )
4117                    if os.path.exists(param_exomiser_analysis_default_config_file):
4118                        # Load prest file into analysis dict (either yaml or json)
4119                        with open(
4120                            param_exomiser_analysis_default_config_file
4121                        ) as json_file:
4122                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4123                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4124                                json_file
4125                            )
4126
4127                    # Error preset file
4128                    else:
4129                        log.error(
4130                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4131                        )
4132                        raise ValueError(
4133                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4134                        )
4135
4136                # If no analysis dict created
4137                if not param_exomiser_analysis_dict:
4138                    log.error(f"No analysis config")
4139                    raise ValueError(f"No analysis config")
4140
4141                # Log
4142                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4143
4144                ### PHENOPACKET ###
4145                ###################
4146
4147                # If no PhenoPacket in analysis dict -> check in param
4148                if "phenopacket" not in param_exomiser_analysis_dict:
4149
4150                    # If PhenoPacket in param -> load anlaysis json
4151                    if param_exomiser.get("phenopacket", None):
4152
4153                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4154                        param_exomiser_phenopacket = full_path(
4155                            param_exomiser_phenopacket
4156                        )
4157
4158                        # If param phenopacket is a file and exists
4159                        if isinstance(
4160                            param_exomiser_phenopacket, str
4161                        ) and os.path.exists(param_exomiser_phenopacket):
4162                            # Load phenopacket file into analysis dict (either yaml or json)
4163                            with open(param_exomiser_phenopacket) as json_file:
4164                                param_exomiser_analysis_dict["phenopacket"] = (
4165                                    yaml.safe_load(json_file)
4166                                )
4167
4168                        # If param phenopacket is a dict
4169                        elif isinstance(param_exomiser_phenopacket, dict):
4170                            # Load phenopacket dict into analysis dict (either yaml or json)
4171                            param_exomiser_analysis_dict["phenopacket"] = (
4172                                param_exomiser_phenopacket
4173                            )
4174
4175                        # Error phenopacket type
4176                        else:
4177                            log.error(f"Phenopacket type unknown. Check param file.")
4178                            raise ValueError(
4179                                f"Phenopacket type unknown. Check param file."
4180                            )
4181
4182                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4183                if "phenopacket" not in param_exomiser_analysis_dict:
4184
4185                    # Init PhenoPacket
4186                    param_exomiser_analysis_dict["phenopacket"] = {
4187                        "id": "analysis",
4188                        "proband": {},
4189                    }
4190
4191                    ### Add subject ###
4192
4193                    # If subject exists
4194                    param_exomiser_subject = param_exomiser.get("subject", {})
4195
4196                    # If subject not exists -> found sample ID
4197                    if not param_exomiser_subject:
4198
4199                        # Found sample ID in param
4200                        sample = param_exomiser.get("sample", None)
4201
4202                        # Find sample ID (first sample)
4203                        if not sample:
4204                            sample_list = self.get_header_sample_list()
4205                            if len(sample_list) > 0:
4206                                sample = sample_list[0]
4207                            else:
4208                                log.error(f"No sample found")
4209                                raise ValueError(f"No sample found")
4210
4211                        # Create subject
4212                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4213
4214                    # Add to dict
4215                    param_exomiser_analysis_dict["phenopacket"][
4216                        "subject"
4217                    ] = param_exomiser_subject
4218
4219                    ### Add "phenotypicFeatures" ###
4220
4221                    # If phenotypicFeatures exists
4222                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4223                        "phenotypicFeatures", []
4224                    )
4225
4226                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4227                    if not param_exomiser_phenotypicfeatures:
4228
4229                        # Found HPO in param
4230                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4231
4232                        # Split HPO if list in string format separated by comma
4233                        if isinstance(param_exomiser_hpo, str):
4234                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4235
4236                        # Create HPO list
4237                        for hpo in param_exomiser_hpo:
4238                            hpo_clean = re.sub("[^0-9]", "", hpo)
4239                            param_exomiser_phenotypicfeatures.append(
4240                                {
4241                                    "type": {
4242                                        "id": f"HP:{hpo_clean}",
4243                                        "label": f"HP:{hpo_clean}",
4244                                    }
4245                                }
4246                            )
4247
4248                    # Add to dict
4249                    param_exomiser_analysis_dict["phenopacket"][
4250                        "phenotypicFeatures"
4251                    ] = param_exomiser_phenotypicfeatures
4252
4253                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4254                    if not param_exomiser_phenotypicfeatures:
4255                        for step in param_exomiser_analysis_dict.get(
4256                            "analysis", {}
4257                        ).get("steps", []):
4258                            if "hiPhivePrioritiser" in step:
4259                                param_exomiser_analysis_dict.get("analysis", {}).get(
4260                                    "steps", []
4261                                ).remove(step)
4262
4263                ### Add Input File ###
4264
4265                # Initial file name and htsFiles
4266                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4267                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4268                    {
4269                        "uri": tmp_vcf_name,
4270                        "htsFormat": "VCF",
4271                        "genomeAssembly": assembly,
4272                    }
4273                ]
4274
4275                ### Add metaData ###
4276
4277                # If metaData not in analysis dict
4278                if "metaData" not in param_exomiser_analysis_dict:
4279                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4280                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4281                        "createdBy": "howard",
4282                        "phenopacketSchemaVersion": 1,
4283                    }
4284
4285                ### OutputOptions ###
4286
4287                # Init output result folder
4288                output_results = os.path.join(tmp_dir, "results")
4289
4290                # If no outputOptions in analysis dict
4291                if "outputOptions" not in param_exomiser_analysis_dict:
4292
4293                    # default output formats
4294                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4295
4296                    # Get outputOptions in param
4297                    output_options = param_exomiser.get("outputOptions", None)
4298
4299                    # If no output_options in param -> check
4300                    if not output_options:
4301                        output_options = {
4302                            "outputContributingVariantsOnly": False,
4303                            "numGenes": 0,
4304                            "outputFormats": defaut_output_formats,
4305                        }
4306
4307                    # Replace outputDirectory in output options
4308                    output_options["outputDirectory"] = output_results
4309                    output_options["outputFileName"] = "howard"
4310
4311                    # Add outputOptions in analysis dict
4312                    param_exomiser_analysis_dict["outputOptions"] = output_options
4313
4314                else:
4315
4316                    # Replace output_results and output format (if exists in param)
4317                    param_exomiser_analysis_dict["outputOptions"][
4318                        "outputDirectory"
4319                    ] = output_results
4320                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4321                        list(
4322                            set(
4323                                param_exomiser_analysis_dict.get(
4324                                    "outputOptions", {}
4325                                ).get("outputFormats", [])
4326                                + ["TSV_VARIANT", "VCF"]
4327                            )
4328                        )
4329                    )
4330
4331                # log
4332                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4333
4334                ### ANALYSIS FILE ###
4335                #####################
4336
4337                ### Full JSON analysis config file ###
4338
4339                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4340                with open(exomiser_analysis, "w") as fp:
4341                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4342
4343                ### SPLIT analysis and sample config files
4344
4345                # Splitted analysis dict
4346                param_exomiser_analysis_dict_for_split = (
4347                    param_exomiser_analysis_dict.copy()
4348                )
4349
4350                # Phenopacket JSON file
4351                exomiser_analysis_phenopacket = os.path.join(
4352                    tmp_dir, "analysis_phenopacket.json"
4353                )
4354                with open(exomiser_analysis_phenopacket, "w") as fp:
4355                    json.dump(
4356                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4357                        fp,
4358                        indent=4,
4359                    )
4360
4361                # Analysis JSON file without Phenopacket parameters
4362                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4363                exomiser_analysis_analysis = os.path.join(
4364                    tmp_dir, "analysis_analysis.json"
4365                )
4366                with open(exomiser_analysis_analysis, "w") as fp:
4367                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4368
4369                ### INITAL VCF file ###
4370                #######################
4371
4372                ### Create list of samples to use and include inti initial VCF file ####
4373
4374                # Subject (main sample)
4375                # Get sample ID in analysis dict
4376                sample_subject = (
4377                    param_exomiser_analysis_dict.get("phenopacket", {})
4378                    .get("subject", {})
4379                    .get("id", None)
4380                )
4381                sample_proband = (
4382                    param_exomiser_analysis_dict.get("phenopacket", {})
4383                    .get("proband", {})
4384                    .get("subject", {})
4385                    .get("id", None)
4386                )
4387                sample = []
4388                if sample_subject:
4389                    sample.append(sample_subject)
4390                if sample_proband:
4391                    sample.append(sample_proband)
4392
4393                # Get sample ID within Pedigree
4394                pedigree_persons_list = (
4395                    param_exomiser_analysis_dict.get("phenopacket", {})
4396                    .get("pedigree", {})
4397                    .get("persons", {})
4398                )
4399
4400                # Create list with all sample ID in pedigree (if exists)
4401                pedigree_persons = []
4402                for person in pedigree_persons_list:
4403                    pedigree_persons.append(person.get("individualId"))
4404
4405                # Concat subject sample ID and samples ID in pedigreesamples
4406                samples = list(set(sample + pedigree_persons))
4407
4408                # Check if sample list is not empty
4409                if not samples:
4410                    log.error(f"No samples found")
4411                    raise ValueError(f"No samples found")
4412
4413                # Create VCF with sample (either sample in param or first one by default)
4414                # Export VCF file
4415                self.export_variant_vcf(
4416                    vcf_file=tmp_vcf_name,
4417                    remove_info=True,
4418                    add_samples=True,
4419                    list_samples=samples,
4420                    index=False,
4421                )
4422
4423                ### Execute Exomiser ###
4424                ########################
4425
4426                # Init command
4427                exomiser_command = ""
4428
4429                # Command exomiser options
4430                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4431
4432                # Release
4433                exomiser_release = param_exomiser.get("release", None)
4434                if exomiser_release:
4435                    # phenotype data version
4436                    exomiser_options += (
4437                        f" --exomiser.phenotype.data-version={exomiser_release} "
4438                    )
4439                    # data version
4440                    exomiser_options += (
4441                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4442                    )
4443                    # variant white list
4444                    variant_white_list_file = (
4445                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4446                    )
4447                    if os.path.exists(
4448                        os.path.join(
4449                            databases_folders, assembly, variant_white_list_file
4450                        )
4451                    ):
4452                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4453
4454                # transcript_source
4455                transcript_source = param_exomiser.get(
4456                    "transcript_source", None
4457                )  # ucsc, refseq, ensembl
4458                if transcript_source:
4459                    exomiser_options += (
4460                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4461                    )
4462
4463                # If analysis contain proband param
4464                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4465                    "proband", {}
4466                ):
4467                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4468
4469                # If no proband (usually uniq sample)
4470                else:
4471                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4472
4473                # Log
4474                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4475
4476                # Run command
4477                result = subprocess.call(
4478                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4479                )
4480                if result:
4481                    log.error("Exomiser command failed")
4482                    raise ValueError("Exomiser command failed")
4483
4484                ### RESULTS ###
4485                ###############
4486
4487                ### Annotate with TSV fields ###
4488
4489                # Init result tsv file
4490                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4491
4492                # Init result tsv file
4493                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4494
4495                # Parse TSV file and explode columns in INFO field
4496                if exomiser_to_info and os.path.exists(output_results_tsv):
4497
4498                    # Log
4499                    log.debug("Exomiser columns to VCF INFO field")
4500
4501                    # Retrieve columns and types
4502                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4503                    output_results_tsv_df = self.get_query_to_df(query)
4504                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4505
4506                    # Init concat fields for update
4507                    sql_query_update_concat_fields = []
4508
4509                    # Fields to avoid
4510                    fields_to_avoid = [
4511                        "CONTIG",
4512                        "START",
4513                        "END",
4514                        "REF",
4515                        "ALT",
4516                        "QUAL",
4517                        "FILTER",
4518                        "GENOTYPE",
4519                    ]
4520
4521                    # List all columns to add into header
4522                    for header_column in output_results_tsv_columns:
4523
4524                        # If header column is enable
4525                        if header_column not in fields_to_avoid:
4526
4527                            # Header info type
4528                            header_info_type = "String"
4529                            header_column_df = output_results_tsv_df[header_column]
4530                            header_column_df_dtype = header_column_df.dtype
4531                            if header_column_df_dtype == object:
4532                                if (
4533                                    pd.to_numeric(header_column_df, errors="coerce")
4534                                    .notnull()
4535                                    .all()
4536                                ):
4537                                    header_info_type = "Float"
4538                            else:
4539                                header_info_type = "Integer"
4540
4541                            # Header info
4542                            characters_to_validate = ["-"]
4543                            pattern = "[" + "".join(characters_to_validate) + "]"
4544                            header_info_name = re.sub(
4545                                pattern,
4546                                "_",
4547                                f"Exomiser_{header_column}".replace("#", ""),
4548                            )
4549                            header_info_number = "."
4550                            header_info_description = (
4551                                f"Exomiser {header_column} annotation"
4552                            )
4553                            header_info_source = "Exomiser"
4554                            header_info_version = "unknown"
4555                            header_info_code = CODE_TYPE_MAP[header_info_type]
4556                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4557                                header_info_name,
4558                                header_info_number,
4559                                header_info_type,
4560                                header_info_description,
4561                                header_info_source,
4562                                header_info_version,
4563                                header_info_code,
4564                            )
4565
4566                            # Add field to add for update to concat fields
4567                            sql_query_update_concat_fields.append(
4568                                f"""
4569                                CASE
4570                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4571                                    THEN concat(
4572                                        '{header_info_name}=',
4573                                        table_parquet."{header_column}",
4574                                        ';'
4575                                        )
4576
4577                                    ELSE ''
4578                                END
4579                            """
4580                            )
4581
4582                    # Update query
4583                    sql_query_update = f"""
4584                        UPDATE {table_variants} as table_variants
4585                            SET INFO = concat(
4586                                            CASE
4587                                                WHEN INFO NOT IN ('', '.')
4588                                                THEN INFO
4589                                                ELSE ''
4590                                            END,
4591                                            CASE
4592                                                WHEN table_variants.INFO NOT IN ('','.')
4593                                                THEN ';'
4594                                                ELSE ''
4595                                            END,
4596                                            (
4597                                            SELECT 
4598                                                concat(
4599                                                    {",".join(sql_query_update_concat_fields)}
4600                                                )
4601                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4602                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4603                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4604                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4605                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4606                                            )
4607                                        )
4608                            ;
4609                        """
4610
4611                    # Update
4612                    self.conn.execute(sql_query_update)
4613
4614                ### Annotate with VCF INFO field ###
4615
4616                # Init result VCF file
4617                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4618
4619                # If VCF exists
4620                if os.path.exists(output_results_vcf):
4621
4622                    # Log
4623                    log.debug("Exomiser result VCF update variants")
4624
4625                    # Find Exomiser INFO field annotation in header
4626                    with gzip.open(output_results_vcf, "rt") as f:
4627                        header_list = self.read_vcf_header(f)
4628                    exomiser_vcf_header = vcf.Reader(
4629                        io.StringIO("\n".join(header_list))
4630                    )
4631
4632                    # Add annotation INFO field to header
4633                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4634
4635                    # Update variants with VCF
4636                    self.update_from_vcf(output_results_vcf)
4637
4638        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4640    def annotation_snpeff(self, threads: int = None) -> None:
4641        """
4642        This function annotate with snpEff
4643
4644        :param threads: The number of threads to use
4645        :return: the value of the variable "return_value".
4646        """
4647
4648        # DEBUG
4649        log.debug("Start annotation with snpeff databases")
4650
4651        # Threads
4652        if not threads:
4653            threads = self.get_threads()
4654        log.debug("Threads: " + str(threads))
4655
4656        # DEBUG
4657        delete_tmp = True
4658        if self.get_config().get("verbosity", "warning") in ["debug"]:
4659            delete_tmp = False
4660            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4661
4662        # Config
4663        config = self.get_config()
4664        log.debug("Config: " + str(config))
4665
4666        # Config - Folders - Databases
4667        databases_folders = (
4668            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4669        )
4670        log.debug("Databases annotations: " + str(databases_folders))
4671
4672        # # Config - Java
4673        # java_bin = get_bin(
4674        #     tool="java",
4675        #     bin="java",
4676        #     bin_type="bin",
4677        #     config=config,
4678        #     default_folder="/usr/bin",
4679        # )
4680        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4681        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4682        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4683
4684        # # Config - snpEff bin
4685        # snpeff_jar = get_bin(
4686        #     tool="snpeff",
4687        #     bin="snpEff.jar",
4688        #     bin_type="jar",
4689        #     config=config,
4690        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4691        # )
4692        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4693        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4694        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4695
4696        # Config - snpEff bin command
4697        snpeff_bin_command = get_bin_command(
4698            bin="snpEff.jar",
4699            tool="snpeff",
4700            bin_type="jar",
4701            config=config,
4702            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4703        )
4704        if not snpeff_bin_command:
4705            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4706            log.error(msg_err)
4707            raise ValueError(msg_err)
4708
4709        # Config - snpEff databases
4710        snpeff_databases = (
4711            config.get("folders", {})
4712            .get("databases", {})
4713            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4714        )
4715        snpeff_databases = full_path(snpeff_databases)
4716        if snpeff_databases is not None and snpeff_databases != "":
4717            log.debug(f"Create snpEff databases folder")
4718            if not os.path.exists(snpeff_databases):
4719                os.makedirs(snpeff_databases)
4720
4721        # Param
4722        param = self.get_param()
4723        log.debug("Param: " + str(param))
4724
4725        # Param
4726        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4727        log.debug("Options: " + str(options))
4728
4729        # Param - Assembly
4730        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4731
4732        # Param - Options
4733        snpeff_options = (
4734            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4735        )
4736        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4737        snpeff_csvstats = (
4738            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4739        )
4740        if snpeff_stats:
4741            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4742            snpeff_stats = full_path(snpeff_stats)
4743            snpeff_options += f" -stats {snpeff_stats}"
4744        if snpeff_csvstats:
4745            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4746            snpeff_csvstats = full_path(snpeff_csvstats)
4747            snpeff_options += f" -csvStats {snpeff_csvstats}"
4748
4749        # Data
4750        table_variants = self.get_table_variants()
4751
4752        # Check if not empty
4753        log.debug("Check if not empty")
4754        sql_query_chromosomes = (
4755            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4756        )
4757        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4758        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4759            log.info(f"VCF empty")
4760            return
4761
4762        # Export in VCF
4763        log.debug("Create initial file to annotate")
4764        tmp_vcf = NamedTemporaryFile(
4765            prefix=self.get_prefix(),
4766            dir=self.get_tmp_dir(),
4767            suffix=".vcf.gz",
4768            delete=True,
4769        )
4770        tmp_vcf_name = tmp_vcf.name
4771
4772        # VCF header
4773        vcf_reader = self.get_header()
4774        log.debug("Initial header: " + str(vcf_reader.infos))
4775
4776        # Existing annotations
4777        for vcf_annotation in self.get_header().infos:
4778
4779            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4780            log.debug(
4781                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4782            )
4783
4784        # Memory limit
4785        # if config.get("memory", None):
4786        #     memory_limit = config.get("memory", "8G")
4787        # else:
4788        #     memory_limit = "8G"
4789        memory_limit = self.get_memory("8G")
4790        log.debug(f"memory_limit: {memory_limit}")
4791
4792        # snpEff java options
4793        snpeff_java_options = (
4794            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4795        )
4796        log.debug(f"Exomiser java options: {snpeff_java_options}")
4797
4798        force_update_annotation = True
4799
4800        if "ANN" not in self.get_header().infos or force_update_annotation:
4801
4802            # Check snpEff database
4803            log.debug(f"Check snpEff databases {[assembly]}")
4804            databases_download_snpeff(
4805                folder=snpeff_databases, assemblies=[assembly], config=config
4806            )
4807
4808            # Export VCF file
4809            self.export_variant_vcf(
4810                vcf_file=tmp_vcf_name,
4811                remove_info=True,
4812                add_samples=False,
4813                index=True,
4814            )
4815
4816            # Tmp file
4817            err_files = []
4818            tmp_annotate_vcf = NamedTemporaryFile(
4819                prefix=self.get_prefix(),
4820                dir=self.get_tmp_dir(),
4821                suffix=".vcf",
4822                delete=False,
4823            )
4824            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4825            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4826            err_files.append(tmp_annotate_vcf_name_err)
4827
4828            # Command
4829            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4830            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4831            run_parallel_commands([snpeff_command], 1)
4832
4833            # Error messages
4834            log.info(f"Error/Warning messages:")
4835            error_message_command_all = []
4836            error_message_command_warning = []
4837            error_message_command_err = []
4838            for err_file in err_files:
4839                with open(err_file, "r") as f:
4840                    for line in f:
4841                        message = line.strip()
4842                        error_message_command_all.append(message)
4843                        if line.startswith("[W::"):
4844                            error_message_command_warning.append(message)
4845                        if line.startswith("[E::"):
4846                            error_message_command_err.append(f"{err_file}: " + message)
4847            # log info
4848            for message in list(
4849                set(error_message_command_err + error_message_command_warning)
4850            ):
4851                log.info(f"   {message}")
4852            # debug info
4853            for message in list(set(error_message_command_all)):
4854                log.debug(f"   {message}")
4855            # failed
4856            if len(error_message_command_err):
4857                log.error("Annotation failed: Error in commands")
4858                raise ValueError("Annotation failed: Error in commands")
4859
4860            # Find annotation in header
4861            with open(tmp_annotate_vcf_name, "rt") as f:
4862                header_list = self.read_vcf_header(f)
4863            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4864
4865            for ann in annovar_vcf_header.infos:
4866                if ann not in self.get_header().infos:
4867                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4868
4869            # Update variants
4870            log.info(f"Annotation - Updating...")
4871            self.update_from_vcf(tmp_annotate_vcf_name)
4872
4873        else:
4874            if "ANN" in self.get_header().infos:
4875                log.debug(f"Existing snpEff annotations in VCF")
4876            if force_update_annotation:
4877                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar databases.

        Exports the variants table to a temporary VCF, runs table_annovar.pl once
        per configured database, cleans and renames the per-database annotated VCFs
        with bcftools/sed/awk, merges them, then updates the in-memory header and
        the variants table from the merged VCF.

        :param threads: number of threads to use (defaults to self.get_threads())
        :return: None (returns early if the variants table is empty)
        :raises ValueError: if the annovar or bcftools command cannot be resolved,
            or if a command reports errors on stderr
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, collected for final cleanup
        tmp_files = []
        err_files = []

        # Keep tmp files/folders in debug mode for inspection
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl + table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used to clean, rename and merge VCFs)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping {database_name: {field: renamed_field, ...}}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even if fields already exist in the header
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (input to bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # No fields configured: keep the whole INFO
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here, so only the last
                # database's err file is re-scanned after the merge step below
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl names its output <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # Field new name; renaming is prepared here but full rename
                    # management is not implemented yet (TODO)
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: g=gene-based, r=region-based, f=filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is handled via --argument above)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr captures for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF and its stderr capture
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: base VCF + all per-database annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged (bgzipped) VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Add new INFO fields to the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup is unconditional ("if True") and ignores
            # delete_tmp computed above — confirm whether debug mode should
            # keep tmp files for inspection
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or more parquet/duckdb annotation
        databases declared in param["annotation"]["parquet"]["annotations"].

        For each configured database, the requested INFO fields (or all fields
        when "ALL"/"INFO" is requested) are appended to the INFO column of the
        variants table through per-chromosome SQL UPDATE queries, and the VCF
        header object is extended with the corresponding INFO definitions.

        :param threads: number of threads to use for the annotation
            (defaults to self.get_threads())
        :return: None; the INFO column of the variants table is updated in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed here but never used in this
        # method — presumably kept for symmetry with the other annotation_*
        # methods; confirm before removing.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Search folders for databases: generic "annotations" folders plus
        # parquet-specific folders, de-duplicated via set().
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database name/path -> {field: new_name or None}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # "annotations_update": re-annotate fields already present in the
        # header by first stripping them from INFO.
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # "annotations_append": only fill fields that are empty/missing for a
        # given variant, leaving existing values untouched.
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        # vcf_reader is the live header object; INFO definitions added below
        # mutate it (and thus self.get_header()) as a side effect.
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): nothing in this method appends to added_columns, so
        # the drop loop at the end is currently a no-op — confirm intent.
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            if "ALL" in annotations:

                # Special key "ALL": scan for every available database
                # (optionally filtered by formats/releases) and annotate each
                # with all of its INFO fields.
                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" is a directive, not a database — already expanded above
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                # Empty/None field spec means "use the whole INFO column"
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion
                    # ATTACH is only needed for duckdb-backed databases;
                    # plain parquet files are referenced directly via the link.
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Extra (non-VCF) columns of the database get a synthetic
                    # String/'.' INFO definition so they can be annotated too.
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        # Replace the spec by an identity mapping over every
                        # field declared in the database header.
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    # Map each requested field to the database column holding
                    # it (falls back to the INFO column below).
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # Annotate when the field exists in the database and
                        # either it is new to the header, or update/append
                        # mode explicitly allows re-processing it.
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                # Strip '<field>=value' (and its separators)
                                # from INFO so the fresh value can be appended.
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            # First field gets no leading ';' separator.
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            # Copy metadata from the database header, with
                            # safe defaults for missing attributes.
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # In append mode, restrict the CASE to variants
                            # whose current value for the field is empty/'.'.
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                # Extract '<field>=value' out of the database's
                                # INFO string via regexp.
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                """
                                )
                            # Found in a specific column
                            else:
                                # Read the value from its dedicated column;
                                # ';' is replaced by ',' to keep INFO valid.
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                """
                                )
                                # Aggregation used by the "regions" join below
                                # (several overlapping regions per position).
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Shortcut: when every field of the database is wanted and
                    # the database exposes a full INFO column, copy INFO
                    # wholesale instead of field-by-field CASE expressions.
                    # Incompatible with append mode (per-field emptiness check)
                    # and with region databases.
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # NOTE(review): this aliases (not copies) the dict of
                        # field-removal queries, so removals run before the
                        # per-chromosome updates and their affected-row counts
                        # are included in nb_of_variant_annotated.
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Join by overlap of POS (or POS+len(REF)-1) with
                            # the region [START+1, END], aggregated per POS.
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT 
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                                )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                        )
                                        as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on CHROM/POS/REF/ALT.
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Appends the new annotations to INFO, inserting a
                            # ';' separator only when both sides are non-empty.
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO = 
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                        AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.') 
                                                    THEN ';'
                                                    ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                            )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated concat() can nest deeper than DuckDB's
                        # default expression-depth limit.
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB reports affected rows of a DML statement
                            # in a single "Count" column.
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

                    log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        # (see NOTE above: added_columns is never populated in this method)
        for added_column in added_columns:
            self.drop_column(column=added_column)

It takes a VCF file, and annotates it with a parquet file

Parameters
  • threads: number of threads to use for the annotation
Returns

None; the INFO column of the variants table is updated in place with the parquet annotations.

def annotation_splice(self, threads: int = None) -> None:
5837    def annotation_splice(self, threads: int = None) -> None:
5838        """
5839        This function annotate with snpEff
5840
5841        :param threads: The number of threads to use
5842        :return: the value of the variable "return_value".
5843        """
5844
5845        # DEBUG
5846        log.debug("Start annotation with splice tools")
5847
5848        # Threads
5849        if not threads:
5850            threads = self.get_threads()
5851        log.debug("Threads: " + str(threads))
5852
5853        # DEBUG
5854        delete_tmp = True
5855        if self.get_config().get("verbosity", "warning") in ["debug"]:
5856            delete_tmp = False
5857            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5858
5859        # Config
5860        config = self.get_config()
5861        log.debug("Config: " + str(config))
5862        splice_config = config.get("tools", {}).get("splice", {})
5863        if not splice_config:
5864            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5865        if not splice_config:
5866            msg_err = "No Splice tool config"
5867            log.error(msg_err)
5868            raise ValueError(msg_err)
5869        log.debug(f"splice_config={splice_config}")
5870
5871        # Config - Folders - Databases
5872        databases_folders = (
5873            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5874        )
5875        log.debug("Databases annotations: " + str(databases_folders))
5876
5877        # Splice docker image
5878        splice_docker_image = splice_config.get("docker").get("image")
5879
5880        # Pull splice image if it's not already there
5881        if not check_docker_image_exists(splice_docker_image):
5882            log.warning(
5883                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5884            )
5885            try:
5886                command(f"docker pull {splice_config.get('docker').get('image')}")
5887            except subprocess.CalledProcessError:
5888                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5889                log.error(msg_err)
5890                raise ValueError(msg_err)
5891                return None
5892
5893        # Config - splice databases
5894        splice_databases = (
5895            config.get("folders", {})
5896            .get("databases", {})
5897            .get("splice", DEFAULT_SPLICE_FOLDER)
5898        )
5899        splice_databases = full_path(splice_databases)
5900
5901        # Param
5902        param = self.get_param()
5903        log.debug("Param: " + str(param))
5904
5905        # Param
5906        options = param.get("annotation", {}).get("splice", {})
5907        log.debug("Options: " + str(options))
5908
5909        # Data
5910        table_variants = self.get_table_variants()
5911
5912        # Check if not empty
5913        log.debug("Check if not empty")
5914        sql_query_chromosomes = (
5915            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5916        )
5917        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
5918            log.info("VCF empty")
5919            return None
5920
5921        # Export in VCF
5922        log.debug("Create initial file to annotate")
5923
5924        # Create output folder
5925        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
5926        if not os.path.exists(output_folder):
5927            Path(output_folder).mkdir(parents=True, exist_ok=True)
5928
5929        # Create tmp VCF file
5930        tmp_vcf = NamedTemporaryFile(
5931            prefix=self.get_prefix(),
5932            dir=output_folder,
5933            suffix=".vcf",
5934            delete=False,
5935        )
5936        tmp_vcf_name = tmp_vcf.name
5937
5938        # VCF header
5939        header = self.get_header()
5940
5941        # Existing annotations
5942        for vcf_annotation in self.get_header().infos:
5943
5944            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5945            log.debug(
5946                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5947            )
5948
5949        # Memory limit
5950        if config.get("memory", None):
5951            memory_limit = config.get("memory", "8G").upper()
5952            # upper()
5953        else:
5954            memory_limit = "8G"
5955        log.debug(f"memory_limit: {memory_limit}")
5956
5957        # Export VCF file
5958        self.export_variant_vcf(
5959            vcf_file=tmp_vcf_name,
5960            remove_info=True,
5961            add_samples=True,
5962            index=False,
5963        )
5964
5965        # Create docker container and launch splice analysis
5966        if splice_config:
5967
5968            # Splice mount folders
5969            mount_folders = splice_config.get("mount", {})
5970
5971            # Genome mount
5972            mount_folders[
5973                config.get("folders", {})
5974                .get("databases", {})
5975                .get("genomes", DEFAULT_GENOME_FOLDER)
5976            ] = "ro"
5977
5978            # SpliceAI mount
5979            mount_folders[
5980                config.get("folders", {})
5981                .get("databases", {})
5982                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
5983            ] = "ro"
5984
5985            # Genome mount
5986            mount_folders[
5987                config.get("folders", {})
5988                .get("databases", {})
5989                .get("spip", DEFAULT_SPIP_FOLDER)
5990            ] = "ro"
5991
5992            # Mount folders
5993            mount = []
5994
5995            # Config mount
5996            mount = [
5997                f"-v {full_path(path)}:{full_path(path)}:{mode}"
5998                for path, mode in mount_folders.items()
5999            ]
6000
6001            if any(value for value in splice_config.values() if value is None):
6002                log.warning("At least one splice config parameter is empty")
6003                return None
6004
6005            # Params in splice nf
6006            def check_values(dico: dict):
6007                """
6008                Ensure parameters for NF splice pipeline
6009                """
6010                for key, val in dico.items():
6011                    if key == "genome":
6012                        if any(
6013                            assemb in options.get("genome", {})
6014                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6015                        ):
6016                            yield f"--{key} hg19"
6017                        elif any(
6018                            assemb in options.get("genome", {})
6019                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6020                        ):
6021                            yield f"--{key} hg38"
6022                    elif (
6023                        (isinstance(val, str) and val)
6024                        or isinstance(val, int)
6025                        or isinstance(val, bool)
6026                    ):
6027                        yield f"--{key} {val}"
6028
6029            # Genome
6030            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6031            options["genome"] = genome
6032
6033            # NF params
6034            nf_params = []
6035
6036            # Add options
6037            if options:
6038                nf_params = list(check_values(options))
6039                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6040            else:
6041                log.debug("No NF params provided")
6042
6043            # Add threads
6044            if "threads" not in options.keys():
6045                nf_params.append(f"--threads {threads}")
6046
6047            # Genome path
6048            genome_path = find_genome(
6049                config.get("folders", {})
6050                .get("databases", {})
6051                .get("genomes", DEFAULT_GENOME_FOLDER),
6052                file=f"{genome}.fa",
6053            )
6054            # Add genome path
6055            if not genome_path:
6056                raise ValueError(
6057                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6058                )
6059            else:
6060                log.debug(f"Genome: {genome_path}")
6061                nf_params.append(f"--genome_path {genome_path}")
6062
6063            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6064                """
6065                Setting up updated databases for SPiP and SpliceAI
6066                """
6067
6068                try:
6069
6070                    # SpliceAI assembly transcriptome
6071                    spliceai_assembly = os.path.join(
6072                        config.get("folders", {})
6073                        .get("databases", {})
6074                        .get("spliceai", {}),
6075                        options.get("genome"),
6076                        "transcriptome",
6077                    )
6078                    spip_assembly = options.get("genome")
6079
6080                    spip = find(
6081                        f"transcriptome_{spip_assembly}.RData",
6082                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6083                    )
6084                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6085                    log.debug(f"SPiP annotations: {spip}")
6086                    log.debug(f"SpliceAI annotations: {spliceai}")
6087                    if spip and spliceai:
6088                        return [
6089                            f"--spip_transcriptome {spip}",
6090                            f"--spliceai_annotations {spliceai}",
6091                        ]
6092                    else:
6093                        # TODO crash and go on with basic annotations ?
6094                        # raise ValueError(
6095                        #     "Can't find splice databases in configuration EXIT"
6096                        # )
6097                        log.warning(
6098                            "Can't find splice databases in configuration, use annotations file from image"
6099                        )
6100                except TypeError:
6101                    log.warning(
6102                        "Can't find splice databases in configuration, use annotations file from image"
6103                    )
6104                    return []
6105
6106            # Add options, check if transcriptome option have already beend provided
6107            if (
6108                "spip_transcriptome" not in nf_params
6109                and "spliceai_transcriptome" not in nf_params
6110            ):
6111                splice_reference = splice_annotations(options, config)
6112                if splice_reference:
6113                    nf_params.extend(splice_reference)
6114
6115            nf_params.append(f"--output_folder {output_folder}")
6116
6117            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6118            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6119            log.debug(cmd)
6120
6121            splice_config["docker"]["command"] = cmd
6122
6123            docker_cmd = get_bin_command(
6124                tool="splice",
6125                bin_type="docker",
6126                config=config,
6127                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6128                add_options=f"--name {random_uuid} {' '.join(mount)}",
6129            )
6130
6131            # Docker debug
6132            # if splice_config.get("rm_container"):
6133            #     rm_container = "--rm"
6134            # else:
6135            #     rm_container = ""
6136            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6137
6138            log.debug(docker_cmd)
6139            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6140            log.debug(res.stdout)
6141            if res.stderr:
6142                log.error(res.stderr)
6143            res.check_returncode()
6144        else:
6145            log.warning(f"Splice tool configuration not found: {config}")
6146
6147        # Update variants
6148        log.info("Annotation - Updating...")
6149        # Test find output vcf
6150        log.debug(
6151            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6152        )
6153        output_vcf = []
6154        # Wrong folder to look in
6155        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6156            if (
6157                files
6158                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6159            ):
6160                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6161        # log.debug(os.listdir(options.get("output_folder")))
6162        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6163        if not output_vcf:
6164            log.debug(
6165                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6166            )
6167        else:
6168            # Get new header from annotated vcf
6169            log.debug(f"Initial header: {len(header.infos)} fields")
6170            # Create new header with splice infos
6171            new_vcf = Variants(input=output_vcf[0])
6172            new_vcf_header = new_vcf.get_header().infos
6173            for keys, infos in new_vcf_header.items():
6174                if keys not in header.infos.keys():
6175                    header.infos[keys] = infos
6176            log.debug(f"New header: {len(header.infos)} fields")
6177            log.debug(f"Splice tmp output: {output_vcf[0]}")
6178            self.update_from_vcf(output_vcf[0])
6179
6180        # Remove folder
6181        remove_if_exists(output_folder)

This function annotates variants with splice prediction tools (SPiP / SpliceAI)

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def get_config_default(self, name: str) -> dict:
6187    def get_config_default(self, name: str) -> dict:
6188        """
6189        The function `get_config_default` returns a dictionary containing default configurations for
6190        various calculations and prioritizations.
6191
6192        :param name: The `get_config_default` function returns a dictionary containing default
6193        configurations for different calculations and prioritizations. The `name` parameter is used to
6194        specify which specific configuration to retrieve from the dictionary
6195        :type name: str
6196        :return: The function `get_config_default` returns a dictionary containing default configuration
6197        settings for different calculations and prioritizations. The specific configuration settings are
6198        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6199        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6200        returned. If there is no match, an empty dictionary is returned.
6201        """
6202
6203        config_default = {
6204            "calculations": {
6205                "variant_chr_pos_alt_ref": {
6206                    "type": "sql",
6207                    "name": "variant_chr_pos_alt_ref",
6208                    "description": "Create a variant ID with chromosome, position, alt and ref",
6209                    "available": False,
6210                    "output_column_name": "variant_chr_pos_alt_ref",
6211                    "output_column_type": "String",
6212                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6213                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6214                    "operation_info": True,
6215                },
6216                "VARTYPE": {
6217                    "type": "sql",
6218                    "name": "VARTYPE",
6219                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6220                    "available": True,
6221                    "output_column_name": "VARTYPE",
6222                    "output_column_type": "String",
6223                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6224                    "operation_query": """
6225                            CASE
6226                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6227                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6228                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6229                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6230                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6231                                ELSE 'UNDEFINED'
6232                            END
6233                            """,
6234                    "info_fields": ["SVTYPE"],
6235                    "operation_info": True,
6236                },
6237                "snpeff_hgvs": {
6238                    "type": "python",
6239                    "name": "snpeff_hgvs",
6240                    "description": "HGVS nomenclatures from snpEff annotation",
6241                    "available": True,
6242                    "function_name": "calculation_extract_snpeff_hgvs",
6243                    "function_params": [],
6244                },
6245                "NOMEN": {
6246                    "type": "python",
6247                    "name": "NOMEN",
6248                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6249                    "available": True,
6250                    "function_name": "calculation_extract_nomen",
6251                    "function_params": [],
6252                },
6253                "FINDBYPIPELINE": {
6254                    "type": "python",
6255                    "name": "FINDBYPIPELINE",
6256                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6257                    "available": True,
6258                    "function_name": "calculation_find_by_pipeline",
6259                    "function_params": ["findbypipeline"],
6260                },
6261                "FINDBYSAMPLE": {
6262                    "type": "python",
6263                    "name": "FINDBYSAMPLE",
6264                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6265                    "available": True,
6266                    "function_name": "calculation_find_by_pipeline",
6267                    "function_params": ["findbysample"],
6268                },
6269                "GENOTYPECONCORDANCE": {
6270                    "type": "python",
6271                    "name": "GENOTYPECONCORDANCE",
6272                    "description": "Concordance of genotype for multi caller VCF",
6273                    "available": True,
6274                    "function_name": "calculation_genotype_concordance",
6275                    "function_params": [],
6276                },
6277                "BARCODE": {
6278                    "type": "python",
6279                    "name": "BARCODE",
6280                    "description": "BARCODE as VaRank tool",
6281                    "available": True,
6282                    "function_name": "calculation_barcode",
6283                    "function_params": [],
6284                },
6285                "BARCODEFAMILY": {
6286                    "type": "python",
6287                    "name": "BARCODEFAMILY",
6288                    "description": "BARCODEFAMILY as VaRank tool",
6289                    "available": True,
6290                    "function_name": "calculation_barcode_family",
6291                    "function_params": ["BCF"],
6292                },
6293                "TRIO": {
6294                    "type": "python",
6295                    "name": "TRIO",
6296                    "description": "Inheritance for a trio family",
6297                    "available": True,
6298                    "function_name": "calculation_trio",
6299                    "function_params": [],
6300                },
6301                "VAF": {
6302                    "type": "python",
6303                    "name": "VAF",
6304                    "description": "Variant Allele Frequency (VAF) harmonization",
6305                    "available": True,
6306                    "function_name": "calculation_vaf_normalization",
6307                    "function_params": [],
6308                },
6309                "VAF_stats": {
6310                    "type": "python",
6311                    "name": "VAF_stats",
6312                    "description": "Variant Allele Frequency (VAF) statistics",
6313                    "available": True,
6314                    "function_name": "calculation_genotype_stats",
6315                    "function_params": ["VAF"],
6316                },
6317                "DP_stats": {
6318                    "type": "python",
6319                    "name": "DP_stats",
6320                    "description": "Depth (DP) statistics",
6321                    "available": True,
6322                    "function_name": "calculation_genotype_stats",
6323                    "function_params": ["DP"],
6324                },
6325                "variant_id": {
6326                    "type": "python",
6327                    "name": "variant_id",
6328                    "description": "Variant ID generated from variant position and type",
6329                    "available": True,
6330                    "function_name": "calculation_variant_id",
6331                    "function_params": [],
6332                },
6333            },
6334            "prioritizations": {
6335                "default": {
6336                    "filter": [
6337                        {
6338                            "type": "notequals",
6339                            "value": "!PASS|\\.",
6340                            "score": 0,
6341                            "flag": "FILTERED",
6342                            "comment": ["Bad variant quality"],
6343                        },
6344                        {
6345                            "type": "equals",
6346                            "value": "REJECT",
6347                            "score": -20,
6348                            "flag": "PASS",
6349                            "comment": ["Bad variant quality"],
6350                        },
6351                    ],
6352                    "DP": [
6353                        {
6354                            "type": "gte",
6355                            "value": "50",
6356                            "score": 5,
6357                            "flag": "PASS",
6358                            "comment": ["DP higher than 50"],
6359                        }
6360                    ],
6361                    "ANN": [
6362                        {
6363                            "type": "contains",
6364                            "value": "HIGH",
6365                            "score": 5,
6366                            "flag": "PASS",
6367                            "comment": [
6368                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6369                            ],
6370                        },
6371                        {
6372                            "type": "contains",
6373                            "value": "MODERATE",
6374                            "score": 3,
6375                            "flag": "PASS",
6376                            "comment": [
6377                                "A non-disruptive variant that might change protein effectiveness"
6378                            ],
6379                        },
6380                        {
6381                            "type": "contains",
6382                            "value": "LOW",
6383                            "score": 0,
6384                            "flag": "FILTERED",
6385                            "comment": [
6386                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6387                            ],
6388                        },
6389                        {
6390                            "type": "contains",
6391                            "value": "MODIFIER",
6392                            "score": 0,
6393                            "flag": "FILTERED",
6394                            "comment": [
6395                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6396                            ],
6397                        },
6398                    ],
6399                }
6400            },
6401        }
6402
6403        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6405    def get_config_json(
6406        self, name: str, config_dict: dict = {}, config_file: str = None
6407    ) -> dict:
6408        """
6409        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6410        default values, a dictionary, and a file.
6411
6412        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6413        the name of the configuration. It is used to identify and retrieve the configuration settings
6414        for a specific component or module
6415        :type name: str
6416        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6417        dictionary that allows you to provide additional configuration settings or overrides. When you
6418        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6419        the key is the configuration setting you want to override or
6420        :type config_dict: dict
6421        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6422        specify the path to a configuration file that contains additional settings. If provided, the
6423        function will read the contents of this file and update the configuration dictionary with the
6424        values found in the file, overriding any existing values with the
6425        :type config_file: str
6426        :return: The function `get_config_json` returns a dictionary containing the configuration
6427        settings.
6428        """
6429
6430        # Create with default prioritizations
6431        config_default = self.get_config_default(name=name)
6432        configuration = config_default
6433        # log.debug(f"configuration={configuration}")
6434
6435        # Replace prioritizations from dict
6436        for config in config_dict:
6437            configuration[config] = config_dict[config]
6438
6439        # Replace prioritizations from file
6440        config_file = full_path(config_file)
6441        if config_file:
6442            if os.path.exists(config_file):
6443                with open(config_file) as config_file_content:
6444                    config_file_dict = json.load(config_file_content)
6445                for config in config_file_dict:
6446                    configuration[config] = config_file_dict[config]
6447            else:
6448                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6449                log.error(msg_error)
6450                raise ValueError(msg_error)
6451
6452        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization(self) -> None:
6477    def prioritization(self) -> None:
6478        """
6479        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
6480        INFO fields
6481        """
6482
6483        # Config
6484        config = self.get_config()
6485
6486        # Param
6487        param = self.get_param()
6488
6489        # Quick Prioritizations
6490        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")
6491
6492        # Configuration profiles
6493        prioritization_config_file = param.get("prioritization", {}).get(
6494            "prioritization_config", None
6495        )
6496        prioritization_config_file = full_path(prioritization_config_file)
6497        prioritizations_config = self.get_config_json(
6498            name="prioritizations", config_file=prioritization_config_file
6499        )
6500
6501        # Prioritization options
6502        profiles = param.get("prioritization", {}).get("profiles", [])
6503        if isinstance(profiles, str):
6504            profiles = profiles.split(",")
6505        pzfields = param.get("prioritization", {}).get(
6506            "pzfields", ["PZFlag", "PZScore"]
6507        )
6508        if isinstance(pzfields, str):
6509            pzfields = pzfields.split(",")
6510        default_profile = param.get("prioritization", {}).get("default_profile", None)
6511        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
6512        prioritization_score_mode = param.get("prioritization", {}).get(
6513            "prioritization_score_mode", "HOWARD"
6514        )
6515
6516        # Quick Prioritizations
6517        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
6518        prioritizations = param.get("prioritizations", None)
6519        if prioritizations:
6520            log.info("Quick Prioritization:")
6521            for profile in prioritizations.split(","):
6522                if profile not in profiles:
6523                    profiles.append(profile)
6524                    log.info(f"   {profile}")
6525
6526        # If profile "ALL" provided, all profiles in the config profiles
6527        if "ALL" in profiles:
6528            profiles = list(prioritizations_config.keys())
6529
6530        for profile in profiles:
6531            if prioritizations_config.get(profile, None):
6532                log.debug(f"Profile '{profile}' configured")
6533            else:
6534                msg_error = f"Profile '{profile}' NOT configured"
6535                log.error(msg_error)
6536                raise ValueError(msg_error)
6537
6538        if profiles:
6539            log.info(f"Prioritization... ")
6540        else:
6541            log.debug(f"No profile defined")
6542            return
6543
6544        if not default_profile and len(profiles):
6545            default_profile = profiles[0]
6546
6547        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6548        log.debug("Profiles to check: " + str(list(profiles)))
6549
6550        # Variables
6551        table_variants = self.get_table_variants(clause="update")
6552
6553        # Added columns
6554        added_columns = []
6555
6556        # Create list of PZfields
6557        # List of PZFields
6558        list_of_pzfields_original = pzfields + [
6559            pzfield + pzfields_sep + profile
6560            for pzfield in pzfields
6561            for profile in profiles
6562        ]
6563        list_of_pzfields = []
6564        log.debug(f"{list_of_pzfields_original}")
6565
6566        # Remove existing PZfields to use if exists
6567        for pzfield in list_of_pzfields_original:
6568            if self.get_header().infos.get(pzfield, None) is None:
6569                list_of_pzfields.append(pzfield)
6570                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6571            else:
6572                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6573
6574        if list_of_pzfields:
6575
6576            # Explode Infos fields
6577            explode_infos_prefix = self.get_explode_infos_prefix()
6578            added_columns += self.explode_infos(prefix=explode_infos_prefix)
6579            extra_infos = self.get_extra_infos()
6580
6581            # PZfields tags description
6582            PZfields_INFOS = {
6583                "PZTags": {
6584                    "ID": "PZTags",
6585                    "Number": ".",
6586                    "Type": "String",
6587                    "Description": "Variant tags based on annotation criteria",
6588                },
6589                "PZScore": {
6590                    "ID": "PZScore",
6591                    "Number": 1,
6592                    "Type": "Integer",
6593                    "Description": "Variant score based on annotation criteria",
6594                },
6595                "PZFlag": {
6596                    "ID": "PZFlag",
6597                    "Number": 1,
6598                    "Type": "String",
6599                    "Description": "Variant flag based on annotation criteria",
6600                },
6601                "PZComment": {
6602                    "ID": "PZComment",
6603                    "Number": ".",
6604                    "Type": "String",
6605                    "Description": "Variant comment based on annotation criteria",
6606                },
6607                "PZInfos": {
6608                    "ID": "PZInfos",
6609                    "Number": ".",
6610                    "Type": "String",
6611                    "Description": "Variant infos based on annotation criteria",
6612                },
6613            }
6614
6615            # Create INFO fields if not exist
6616            for field in PZfields_INFOS:
6617                field_ID = PZfields_INFOS[field]["ID"]
6618                field_description = PZfields_INFOS[field]["Description"]
6619                if field_ID not in self.get_header().infos and field_ID in pzfields:
6620                    field_description = (
6621                        PZfields_INFOS[field]["Description"]
6622                        + f", profile {default_profile}"
6623                    )
6624                    self.get_header().infos[field_ID] = vcf.parser._Info(
6625                        field_ID,
6626                        PZfields_INFOS[field]["Number"],
6627                        PZfields_INFOS[field]["Type"],
6628                        field_description,
6629                        "unknown",
6630                        "unknown",
6631                        code_type_map[PZfields_INFOS[field]["Type"]],
6632                    )
6633
6634            # Create INFO fields if not exist for each profile
6635            for profile in prioritizations_config:
6636                if profile in profiles or profiles == []:
6637                    for field in PZfields_INFOS:
6638                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6639                        field_description = (
6640                            PZfields_INFOS[field]["Description"]
6641                            + f", profile {profile}"
6642                        )
6643                        if (
6644                            field_ID not in self.get_header().infos
6645                            and field in pzfields
6646                        ):
6647                            self.get_header().infos[field_ID] = vcf.parser._Info(
6648                                field_ID,
6649                                PZfields_INFOS[field]["Number"],
6650                                PZfields_INFOS[field]["Type"],
6651                                field_description,
6652                                "unknown",
6653                                "unknown",
6654                                code_type_map[PZfields_INFOS[field]["Type"]],
6655                            )
6656
6657            # Header
6658            for pzfield in list_of_pzfields:
6659                if re.match("PZScore.*", pzfield):
6660                    added_column = self.add_column(
6661                        table_name=table_variants,
6662                        column_name=pzfield,
6663                        column_type="INTEGER",
6664                        default_value="0",
6665                    )
6666                elif re.match("PZFlag.*", pzfield):
6667                    added_column = self.add_column(
6668                        table_name=table_variants,
6669                        column_name=pzfield,
6670                        column_type="BOOLEAN",
6671                        default_value="1",
6672                    )
6673                else:
6674                    added_column = self.add_column(
6675                        table_name=table_variants,
6676                        column_name=pzfield,
6677                        column_type="STRING",
6678                        default_value="''",
6679                    )
6680                added_columns.append(added_column)
6681
6682            # Profiles
6683            if profiles:
6684
6685                # foreach profile in configuration file
6686                for profile in prioritizations_config:
6687
6688                    # If profile is asked in param, or ALL are asked (empty profile [])
6689                    if profile in profiles or profiles == []:
6690                        log.info(f"Profile '{profile}'")
6691
6692                        sql_set_info_option = ""
6693
6694                        sql_set_info = []
6695
6696                        # PZ fields set
6697
6698                        # PZScore
6699                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
6700                            sql_set_info.append(
6701                                f"""
6702                                    concat(
6703                                        'PZScore{pzfields_sep}{profile}=',
6704                                        PZScore{pzfields_sep}{profile}
6705                                    ) 
6706                                """
6707                            )
6708                            if (
6709                                profile == default_profile
6710                                and "PZScore" in list_of_pzfields
6711                            ):
6712                                sql_set_info.append(
6713                                    f"""
6714                                        concat(
6715                                            'PZScore=',
6716                                            PZScore{pzfields_sep}{profile}
6717                                        )
6718                                    """
6719                                )
6720
6721                        # PZFlag
6722                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6723                            sql_set_info.append(
6724                                f"""
6725                                    concat(
6726                                        'PZFlag{pzfields_sep}{profile}=',
6727                                        CASE 
6728                                            WHEN PZFlag{pzfields_sep}{profile}==1
6729                                            THEN 'PASS'
6730                                            WHEN PZFlag{pzfields_sep}{profile}==0
6731                                            THEN 'FILTERED'
6732                                        END
6733                                    ) 
6734                                """
6735                            )
6736                            if (
6737                                profile == default_profile
6738                                and "PZFlag" in list_of_pzfields
6739                            ):
6740                                sql_set_info.append(
6741                                    f"""
6742                                        concat(
6743                                            'PZFlag=',
6744                                            CASE 
6745                                                WHEN PZFlag{pzfields_sep}{profile}==1
6746                                                THEN 'PASS'
6747                                                WHEN PZFlag{pzfields_sep}{profile}==0
6748                                                THEN 'FILTERED'
6749                                            END
6750                                        )
6751                                    """
6752                                )
6753
6754                        # PZComment
6755                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
6756                            sql_set_info.append(
6757                                f"""
6758                                    CASE
6759                                        WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6760                                        THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
6761                                        ELSE ''
6762                                    END
6763                                """
6764                            )
6765                            if (
6766                                profile == default_profile
6767                                and "PZComment" in list_of_pzfields
6768                            ):
6769                                sql_set_info.append(
6770                                    f"""
6771                                        CASE
6772                                            WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
6773                                            THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
6774                                            ELSE ''
6775                                        END
6776                                    """
6777                                )
6778
6779                        # PZInfos
6780                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
6781                            sql_set_info.append(
6782                                f"""
6783                                    CASE
6784                                        WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6785                                        THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
6786                                        ELSE ''
6787                                    END
6788                                """
6789                            )
6790                            if (
6791                                profile == default_profile
6792                                and "PZInfos" in list_of_pzfields
6793                            ):
6794                                sql_set_info.append(
6795                                    f"""
6796                                        CASE
6797                                            WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
6798                                            THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
6799                                            ELSE ''
6800                                        END
6801                                    """
6802                                )
6803
6804                        # Merge PZfields
6805                        sql_set_info_option = ""
6806                        sql_set_sep = ""
6807                        for sql_set in sql_set_info:
6808                            if sql_set_sep:
6809                                sql_set_info_option += f"""
6810                                    , concat('{sql_set_sep}', {sql_set})
6811                                """
6812                            else:
6813                                sql_set_info_option += f"""
6814                                    , {sql_set}
6815                                """
6816                            sql_set_sep = ";"
6817
6818                        sql_queries = []
6819                        for annotation in prioritizations_config[profile]:
6820
6821                            # Check if annotation field is present
6822                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
6823                                log.debug(f"Annotation '{annotation}' not in data")
6824                                continue
6825                            else:
6826                                log.debug(f"Annotation '{annotation}' in data")
6827
6828                            # For each criterions
6829                            for criterion in prioritizations_config[profile][
6830                                annotation
6831                            ]:
6832                                criterion_type = criterion["type"]
6833                                criterion_value = criterion["value"]
6834                                criterion_score = criterion.get("score", 0)
6835                                criterion_flag = criterion.get("flag", "PASS")
6836                                criterion_flag_bool = criterion_flag == "PASS"
6837                                criterion_comment = (
6838                                    ", ".join(criterion.get("comment", []))
6839                                    .replace("'", "''")
6840                                    .replace(";", ",")
6841                                    .replace("\t", " ")
6842                                )
6843                                criterion_infos = (
6844                                    str(criterion)
6845                                    .replace("'", "''")
6846                                    .replace(";", ",")
6847                                    .replace("\t", " ")
6848                                )
6849
6850                                sql_set = []
6851                                sql_set_info = []
6852
6853                                # PZ fields set
6854                                if (
6855                                    f"PZScore{pzfields_sep}{profile}"
6856                                    in list_of_pzfields
6857                                ):
6858                                    if prioritization_score_mode == "HOWARD":
6859                                        sql_set.append(
6860                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6861                                        )
6862                                    elif prioritization_score_mode == "VaRank":
6863                                        sql_set.append(
6864                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
6865                                        )
6866                                    else:
6867                                        sql_set.append(
6868                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
6869                                        )
6870                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
6871                                    sql_set.append(
6872                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
6873                                    )
6874                                if (
6875                                    f"PZComment{pzfields_sep}{profile}"
6876                                    in list_of_pzfields
6877                                ):
6878                                    sql_set.append(
6879                                        f"""
6880                                            PZComment{pzfields_sep}{profile} = 
6881                                                concat(
6882                                                    PZComment{pzfields_sep}{profile},
6883                                                    CASE 
6884                                                        WHEN PZComment{pzfields_sep}{profile}!=''
6885                                                        THEN ', '
6886                                                        ELSE ''
6887                                                    END,
6888                                                    '{criterion_comment}'
6889                                                )
6890                                        """
6891                                    )
6892                                if (
6893                                    f"PZInfos{pzfields_sep}{profile}"
6894                                    in list_of_pzfields
6895                                ):
6896                                    sql_set.append(
6897                                        f"""
6898                                            PZInfos{pzfields_sep}{profile} = 
6899                                                concat(
6900                                                    PZInfos{pzfields_sep}{profile},
6901                                                    '{criterion_infos}'
6902                                                )
6903                                        """
6904                                    )
6905                                sql_set_option = ",".join(sql_set)
6906
6907                                # Criterion and comparison
6908                                try:
6909                                    float(criterion_value)
6910                                    sql_update = f"""
6911                                        UPDATE {table_variants}
6912                                        SET {sql_set_option}
6913                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
6914                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
6915                                        """
6916                                except:
6917                                    contains_option = ""
6918                                    if criterion_type == "contains":
6919                                        contains_option = ".*"
6920                                    sql_update = f"""
6921                                        UPDATE {table_variants}
6922                                        SET {sql_set_option}
6923                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
6924                                        """
6925                                sql_queries.append(sql_update)
6926
6927                        # PZTags
6928                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:
6929
6930                            # Create PZFalgs value
6931                            pztags_value = ""
6932                            pztags_sep_default = "|"
6933                            pztags_sep = ""
6934                            for pzfield in pzfields:
6935                                if pzfield not in ["PZTags"]:
6936                                    if (
6937                                        f"{pzfield}{pzfields_sep}{profile}"
6938                                        in list_of_pzfields
6939                                    ):
6940                                        if pzfield in ["PZFlag"]:
6941                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
6942                                                CASE WHEN PZFlag{pzfields_sep}{profile}
6943                                                    THEN 'PASS'
6944                                                    ELSE 'FILTERED'
6945                                                END, '"""
6946                                        else:
6947                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
6948                                        pztags_sep = pztags_sep_default
6949
6950                            # Add Query update for PZFlags
6951                            sql_update_pztags = f"""
6952                                UPDATE {table_variants}
6953                                SET INFO = concat(
6954                                        INFO,
6955                                        CASE WHEN INFO NOT in ('','.')
6956                                                THEN ';'
6957                                                ELSE ''
6958                                        END,
6959                                        'PZTags{pzfields_sep}{profile}={pztags_value}'
6960                                    )
6961                                """
6962                            sql_queries.append(sql_update_pztags)
6963
6964                            # Add Query update for PZFlags for default
6965                            if profile == default_profile:
6966                                sql_update_pztags_default = f"""
6967                                UPDATE {table_variants}
6968                                SET INFO = concat(
6969                                        INFO,
6970                                        ';',
6971                                        'PZTags={pztags_value}'
6972                                    )
6973                                """
6974                                sql_queries.append(sql_update_pztags_default)
6975
6976                        log.info(f"""Profile '{profile}' - Prioritization... """)
6977
6978                        if sql_queries:
6979
6980                            for sql_query in sql_queries:
6981                                log.debug(
6982                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
6983                                )
6984                                self.conn.execute(sql_query)
6985
6986                        log.info(f"""Profile '{profile}' - Update... """)
6987                        sql_query_update = f"""
6988                            UPDATE {table_variants}
6989                            SET INFO =  
6990                                concat(
6991                                    CASE
6992                                        WHEN INFO NOT IN ('','.')
6993                                        THEN concat(INFO, ';')
6994                                        ELSE ''
6995                                    END
6996                                    {sql_set_info_option}
6997                                )
6998                        """
6999                        self.conn.execute(sql_query_update)
7000
7001        else:
7002
7003            log.warning(f"No profiles in parameters")
7004
7005        # Remove added columns
7006        for added_column in added_columns:
7007            self.drop_column(column=added_column)
7008
7009        # Explode INFOS fields into table fields
7010        if self.get_explode_infos():
7011            self.explode_infos(
7012                prefix=self.get_explode_infos_prefix(),
7013                fields=self.get_explode_infos_fields(),
7014                force=True,
7015            )
7016
7017        return

It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other INFO fields

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline: locate the genome (pyfaidx), refSeq and refSeqLink databases; select
        letter-only REF/ALT variants (SNV/InDel) from the variants table; compute HGVS
        names per variant over a Dask dataframe partitioned by thread count; write the
        results back through a temporary Parquet file; append 'hgvs=...' to the INFO
        column; declare the 'hgvs' INFO field in the VCF header; drop temporary columns.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closes over `polars_conn`, `transcripts`, `genome` and the hgvs option flags,
            all of which are bound in the enclosing method before the Dask compute runs.

            :param row: A dictionary-like object that contains the values for the following keys:
            CHROM, POS, REF, ALT
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): CHROM is interpolated directly into the SQL string —
            # assumes chromosome names never contain quotes; verify upstream sanitization
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                # Protein accession is only looked up when a protein-level name is needed
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # add_protein: emit a second, protein-level name alongside the first
                # (skipped when the first name is already protein-level or full format)
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): polars_conn is re-created further below (after the refseq
        # dataframes exist); since the closures resolve the name at call time, this
        # first instance appears unused — TODO confirm and remove
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS
        # Parse the comma-separated "hgvs_options" shortcut (e.g. "use_gene,codon_type=1")
        # into the param["hgvs"] dict; bare options default to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled
        # Early return: nothing to do when no "hgvs" section is configured
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param-level settings override the config-level ones
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome
        # Explicit genome path first, then lookup by assembly in the genomes folder
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only
        # (REF/ALT restricted to letters only — excludes symbolic alleles and breakends)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix in [0, 1000) to avoid clashing with an existing
        # column — collision is unlikely but not impossible; TODO confirm acceptable
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # Only transcripts overlapping a selected variant position are kept
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        # refSeqLink is optional: when absent, protein accessions are simply unavailable
        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to a TSV through duckdb COPY, then parsed by read_transcripts
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # Created here so refseq_df / refseqlink_df are in scope for register_globals;
        # this is the instance the partition closures actually use
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            # Join back on CHROM/POS/REF/ALT; empty or NULL results are skipped
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column
        # Append 'hgvs=<names>' with a ';' separator only when INFO already has content
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): 'annotatation' typo in the user-facing Description string —
        # left unchanged here as it is runtime output
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7410    def get_operations_help(
7411        self, operations_config_dict: dict = {}, operations_config_file: str = None
7412    ) -> list:
7413
7414        # Init
7415        operations_help = []
7416
7417        # operations
7418        operations = self.get_config_json(
7419            name="calculations",
7420            config_dict=operations_config_dict,
7421            config_file=operations_config_file,
7422        )
7423        for op in operations:
7424            op_name = operations[op].get("name", op).upper()
7425            op_description = operations[op].get("description", op_name)
7426            op_available = operations[op].get("available", False)
7427            if op_available:
7428                operations_help.append(f"   {op_name}: {op_description}")
7429
7430        # Sort operations
7431        operations_help.sort()
7432
7433        # insert header
7434        operations_help.insert(0, "Available calculation operations:")
7435
7436        # Return
7437        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7439    def calculation(
7440        self,
7441        operations: dict = {},
7442        operations_config_dict: dict = {},
7443        operations_config_file: str = None,
7444    ) -> None:
7445        """
7446        It takes a list of operations, and for each operation, it checks if it's a python or sql
7447        operation, and then calls the appropriate function
7448
7449        param json example:
7450            "calculation": {
7451                "NOMEN": {
7452                    "options": {
7453                        "hgvs_field": "hgvs"
7454                    },
7455                "middle" : null
7456            }
7457        """
7458
7459        # Param
7460        param = self.get_param()
7461
7462        # operations config
7463        operations_config = self.get_config_json(
7464            name="calculations",
7465            config_dict=operations_config_dict,
7466            config_file=operations_config_file,
7467        )
7468
7469        # Upper keys
7470        operations_config = {k.upper(): v for k, v in operations_config.items()}
7471
7472        # Calculations
7473
7474        # Operations from param
7475        operations = param.get("calculation", {}).get("calculations", operations)
7476
7477        # Quick calculation - add
7478        if param.get("calculations", None):
7479            calculations_list = [
7480                value for value in param.get("calculations", "").split(",")
7481            ]
7482            log.info(f"Quick Calculations:")
7483            for calculation_key in calculations_list:
7484                log.info(f"   {calculation_key}")
7485            for calculation_operation in calculations_list:
7486                if calculation_operation.upper() not in operations:
7487                    operations[calculation_operation.upper()] = {}
7488                    add_value_into_dict(
7489                        dict_tree=param,
7490                        sections=[
7491                            "calculation",
7492                            "calculations",
7493                            calculation_operation.upper(),
7494                        ],
7495                        value={},
7496                    )
7497
7498        # Operations for calculation
7499        if not operations:
7500            operations = param.get("calculation", {}).get("calculations", {})
7501
7502        if operations:
7503            log.info(f"Calculations...")
7504
7505        # For each operations
7506        for operation_name in operations:
7507            operation_name = operation_name.upper()
7508            if operation_name not in [""]:
7509                if operation_name in operations_config:
7510                    log.info(f"Calculation '{operation_name}'")
7511                    operation = operations_config[operation_name]
7512                    operation_type = operation.get("type", "sql")
7513                    if operation_type == "python":
7514                        self.calculation_process_function(
7515                            operation=operation, operation_name=operation_name
7516                        )
7517                    elif operation_type == "sql":
7518                        self.calculation_process_sql(
7519                            operation=operation, operation_name=operation_name
7520                        )
7521                    else:
7522                        log.error(
7523                            f"Operations config: Type '{operation_type}' NOT available"
7524                        )
7525                        raise ValueError(
7526                            f"Operations config: Type '{operation_type}' NOT available"
7527                        )
7528                else:
7529                    log.error(
7530                        f"Operations config: Calculation '{operation_name}' NOT available"
7531                    )
7532                    raise ValueError(
7533                        f"Operations config: Calculation '{operation_name}' NOT available"
7534                    )
7535
7536        # Explode INFOS fields into table fields
7537        if self.get_explode_infos():
7538            self.explode_infos(
7539                prefix=self.get_explode_infos_prefix(),
7540                fields=self.get_explode_infos_fields(),
7541                force=True,
7542            )

It takes a list of operations and, for each operation, checks whether it is a python or sql operation, then calls the appropriate function.

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7544    def calculation_process_sql(
7545        self, operation: dict, operation_name: str = "unknown"
7546    ) -> None:
7547        """
7548        The `calculation_process_sql` function takes in a mathematical operation as a string and
7549        performs the operation, updating the specified table with the result.
7550
7551        :param operation: The `operation` parameter is a dictionary that contains information about the
7552        mathematical operation to be performed. It includes the following keys:
7553        :type operation: dict
7554        :param operation_name: The `operation_name` parameter is a string that represents the name of
7555        the mathematical operation being performed. It is used for logging and error handling purposes,
7556        defaults to unknown
7557        :type operation_name: str (optional)
7558        """
7559
7560        # table variants
7561        table_variants = self.get_table_variants(clause="alter")
7562
7563        # Operation infos
7564        operation_name = operation.get("name", "unknown")
7565        log.debug(f"process sql {operation_name}")
7566        output_column_name = operation.get("output_column_name", operation_name)
7567        output_column_type = operation.get("output_column_type", "String")
7568        prefix = operation.get("explode_infos_prefix", "")
7569        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7570        output_column_description = operation.get(
7571            "output_column_description", f"{operation_name} operation"
7572        )
7573        operation_query = operation.get("operation_query", None)
7574        if isinstance(operation_query, list):
7575            operation_query = " ".join(operation_query)
7576        operation_info_fields = operation.get("info_fields", [])
7577        operation_info_fields_check = operation.get("info_fields_check", False)
7578        operation_info = operation.get("operation_info", True)
7579
7580        if operation_query:
7581
7582            # Info fields check
7583            operation_info_fields_check_result = True
7584            if operation_info_fields_check:
7585                header_infos = self.get_header().infos
7586                for info_field in operation_info_fields:
7587                    operation_info_fields_check_result = (
7588                        operation_info_fields_check_result
7589                        and info_field in header_infos
7590                    )
7591
7592            # If info fields available
7593            if operation_info_fields_check_result:
7594
7595                # Added_columns
7596                added_columns = []
7597
7598                # Create VCF header field
7599                vcf_reader = self.get_header()
7600                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7601                    output_column_name,
7602                    ".",
7603                    output_column_type,
7604                    output_column_description,
7605                    "howard calculation",
7606                    "0",
7607                    self.code_type_map.get(output_column_type),
7608                )
7609
7610                # Explode infos if needed
7611                log.debug(f"calculation_process_sql prefix {prefix}")
7612                added_columns += self.explode_infos(
7613                    prefix=prefix,
7614                    fields=[output_column_name] + operation_info_fields,
7615                    force=True,
7616                )
7617
7618                # Create column
7619                added_column = self.add_column(
7620                    table_name=table_variants,
7621                    column_name=prefix + output_column_name,
7622                    column_type=output_column_type_sql,
7623                    default_value="null",
7624                )
7625                added_columns.append(added_column)
7626
7627                # Operation calculation
7628                try:
7629
7630                    # Query to update calculation column
7631                    sql_update = f"""
7632                        UPDATE {table_variants}
7633                        SET "{prefix}{output_column_name}" = ({operation_query})
7634                    """
7635                    self.conn.execute(sql_update)
7636
7637                    # Add to INFO
7638                    if operation_info:
7639                        sql_update_info = f"""
7640                            UPDATE {table_variants}
7641                            SET "INFO" =
7642                                concat(
7643                                    CASE
7644                                        WHEN "INFO" IS NOT NULL
7645                                        THEN concat("INFO", ';')
7646                                        ELSE ''
7647                                    END,
7648                                    '{output_column_name}=',
7649                                    "{prefix}{output_column_name}"
7650                                )
7651                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7652                        """
7653                        self.conn.execute(sql_update_info)
7654
7655                except:
7656                    log.error(
7657                        f"Operations config: Calculation '{operation_name}' query failed"
7658                    )
7659                    raise ValueError(
7660                        f"Operations config: Calculation '{operation_name}' query failed"
7661                    )
7662
7663                # Remove added columns
7664                for added_column in added_columns:
7665                    log.debug(f"added_column: {added_column}")
7666                    self.drop_column(column=added_column)
7667
7668            else:
7669                log.error(
7670                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7671                )
7672                raise ValueError(
7673                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7674                )
7675
7676        else:
7677            log.error(
7678                f"Operations config: Calculation '{operation_name}' query NOT defined"
7679            )
7680            raise ValueError(
7681                f"Operations config: Calculation '{operation_name}' query NOT defined"
7682            )

The calculation_process_sql function takes in an operation definition and performs the corresponding SQL operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the SQL operation to be performed, including its query and output column settings.
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging and error handling purposes; defaults to unknown.
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
7684    def calculation_process_function(
7685        self, operation: dict, operation_name: str = "unknown"
7686    ) -> None:
7687        """
7688        The `calculation_process_function` takes in an operation dictionary and performs the specified
7689        function with the given parameters.
7690
7691        :param operation: The `operation` parameter is a dictionary that contains information about the
7692        operation to be performed. It has the following keys:
7693        :type operation: dict
7694        :param operation_name: The `operation_name` parameter is a string that represents the name of
7695        the operation being performed. It is used for logging purposes, defaults to unknown
7696        :type operation_name: str (optional)
7697        """
7698
7699        operation_name = operation["name"]
7700        log.debug(f"process sql {operation_name}")
7701        function_name = operation["function_name"]
7702        function_params = operation["function_params"]
7703        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
7705    def calculation_variant_id(self) -> None:
7706        """
7707        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7708        updates the INFO field of a variants table with the variant ID.
7709        """
7710
7711        # variant_id annotation field
7712        variant_id_tag = self.get_variant_id_column()
7713        added_columns = [variant_id_tag]
7714
7715        # variant_id hgvs tags"
7716        vcf_infos_tags = {
7717            variant_id_tag: "howard variant ID annotation",
7718        }
7719
7720        # Variants table
7721        table_variants = self.get_table_variants()
7722
7723        # Header
7724        vcf_reader = self.get_header()
7725
7726        # Add variant_id to header
7727        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7728            variant_id_tag,
7729            ".",
7730            "String",
7731            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7732            "howard calculation",
7733            "0",
7734            self.code_type_map.get("String"),
7735        )
7736
7737        # Update
7738        sql_update = f"""
7739            UPDATE {table_variants}
7740            SET "INFO" = 
7741                concat(
7742                    CASE
7743                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7744                        THEN ''
7745                        ELSE concat("INFO", ';')
7746                    END,
7747                    '{variant_id_tag}=',
7748                    "{variant_id_tag}"
7749                )
7750        """
7751        self.conn.execute(sql_update)
7752
7753        # Remove added columns
7754        for added_column in added_columns:
7755            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs(self) -> None:
7757    def calculation_extract_snpeff_hgvs(self) -> None:
7758        """
7759        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7760        annotation field in a VCF file and adds them as a new column in the variants table.
7761        """
7762
7763        # SnpEff annotation field
7764        snpeff_ann = "ANN"
7765
7766        # SnpEff annotation field
7767        snpeff_hgvs = "snpeff_hgvs"
7768
7769        # Snpeff hgvs tags
7770        vcf_infos_tags = {
7771            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7772        }
7773
7774        # Prefix
7775        prefix = self.get_explode_infos_prefix()
7776        if prefix:
7777            prefix = "INFO/"
7778
7779        # snpEff fields
7780        speff_ann_infos = prefix + snpeff_ann
7781        speff_hgvs_infos = prefix + snpeff_hgvs
7782
7783        # Variants table
7784        table_variants = self.get_table_variants()
7785
7786        # Header
7787        vcf_reader = self.get_header()
7788
7789        # Add columns
7790        added_columns = []
7791
7792        # Explode HGVS field in column
7793        added_columns += self.explode_infos(fields=[snpeff_ann])
7794
7795        if "ANN" in vcf_reader.infos:
7796
7797            log.debug(vcf_reader.infos["ANN"])
7798
7799            # Create variant id
7800            variant_id_column = self.get_variant_id_column()
7801            added_columns += [variant_id_column]
7802
7803            # Create dataframe
7804            dataframe_snpeff_hgvs = self.get_query_to_df(
7805                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
7806            )
7807
7808            # Create main NOMEN column
7809            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
7810                speff_ann_infos
7811            ].apply(lambda x: extract_snpeff_hgvs(str(x)))
7812
7813            # Add snpeff_hgvs to header
7814            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
7815                snpeff_hgvs,
7816                ".",
7817                "String",
7818                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
7819                "howard calculation",
7820                "0",
7821                self.code_type_map.get("String"),
7822            )
7823
7824            # Update
7825            sql_update = f"""
7826                UPDATE variants
7827                SET "INFO" = 
7828                    concat(
7829                        CASE
7830                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7831                            THEN ''
7832                            ELSE concat("INFO", ';')
7833                        END,
7834                        CASE 
7835                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
7836                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
7837                            THEN concat(
7838                                    '{snpeff_hgvs}=',
7839                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
7840                                )
7841                            ELSE ''
7842                        END
7843                    )
7844                FROM dataframe_snpeff_hgvs
7845                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
7846
7847            """
7848            self.conn.execute(sql_update)
7849
7850            # Delete dataframe
7851            del dataframe_snpeff_hgvs
7852            gc.collect()
7853
7854        else:
7855
7856            log.warning(
7857                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
7858            )
7859
7860        # Remove added columns
7861        for added_column in added_columns:
7862            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

def calculation_extract_nomen(self) -> None:
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature from the calculation/identification of NOMEN.

        Reads the configured HGVS field, computes the NOMEN structure for each
        variant (via `find_nomen`), declares one VCF header INFO entry per
        NOMEN sub-field, and appends each non-empty sub-field to the INFO
        column of the variants table.
        """

        # Name of the temporary dataframe column holding the full NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: sub-field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (param path: calculation.calculations.NOMEN.options.hgvs_field)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get preferred transcripts file, if configured
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                # First column of the transcripts file is the transcript identifier
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE(review): the local name 'dataframe_hgvs' is referenced by
            # the UPDATE below (duckdb replacement scan) — do not rename
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (lambda is applied immediately, so capturing the loop
                # variable is safe here)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # SQL fragment appending ';FIELD=value' only when the value is
                # set (each fragment carries its own leading ';')
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): unlike sibling calculations, only NULL INFO is
            # treated as empty ('' and '.' are kept verbatim) and no separator
            # normalisation is done — confirm the separator behaviour
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe eagerly
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8007    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8008        """
8009        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8010        pipeline/sample for a variant and updates the variant information in a VCF file.
8011
8012        :param tag: The `tag` parameter is a string that represents the annotation field for the
8013        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8014        VCF header and to update the corresponding field in the variants table, defaults to
8015        findbypipeline
8016        :type tag: str (optional)
8017        """
8018
8019        # if FORMAT and samples
8020        if (
8021            "FORMAT" in self.get_header_columns_as_list()
8022            and self.get_header_sample_list()
8023        ):
8024
8025            # findbypipeline annotation field
8026            findbypipeline_tag = tag
8027
8028            # VCF infos tags
8029            vcf_infos_tags = {
8030                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8031            }
8032
8033            # Prefix
8034            prefix = self.get_explode_infos_prefix()
8035
8036            # Field
8037            findbypipeline_infos = prefix + findbypipeline_tag
8038
8039            # Variants table
8040            table_variants = self.get_table_variants()
8041
8042            # Header
8043            vcf_reader = self.get_header()
8044
8045            # Create variant id
8046            variant_id_column = self.get_variant_id_column()
8047            added_columns = [variant_id_column]
8048
8049            # variant_id, FORMAT and samples
8050            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8051                self.get_header_sample_list()
8052            )
8053
8054            # Create dataframe
8055            dataframe_findbypipeline = self.get_query_to_df(
8056                f""" SELECT {samples_fields} FROM {table_variants} """
8057            )
8058
8059            # Create findbypipeline column
8060            dataframe_findbypipeline[findbypipeline_infos] = (
8061                dataframe_findbypipeline.apply(
8062                    lambda row: findbypipeline(
8063                        row, samples=self.get_header_sample_list()
8064                    ),
8065                    axis=1,
8066                )
8067            )
8068
8069            # Add snpeff_hgvs to header
8070            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8071                findbypipeline_tag,
8072                ".",
8073                "String",
8074                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8075                "howard calculation",
8076                "0",
8077                self.code_type_map.get("String"),
8078            )
8079
8080            # Update
8081            sql_update = f"""
8082                UPDATE variants
8083                SET "INFO" = 
8084                    concat(
8085                        CASE
8086                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8087                            THEN ''
8088                            ELSE concat("INFO", ';')
8089                        END,
8090                        CASE 
8091                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8092                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8093                            THEN concat(
8094                                    '{findbypipeline_tag}=',
8095                                    dataframe_findbypipeline."{findbypipeline_infos}"
8096                                )
8097                            ELSE ''
8098                        END
8099                    )
8100                FROM dataframe_findbypipeline
8101                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8102            """
8103            self.conn.execute(sql_update)
8104
8105            # Remove added columns
8106            for added_column in added_columns:
8107                self.drop_column(column=added_column)
8108
8109            # Delete dataframe
8110            del dataframe_findbypipeline
8111            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8113    def calculation_genotype_concordance(self) -> None:
8114        """
8115        The function `calculation_genotype_concordance` calculates the genotype concordance for
8116        multi-caller VCF files and updates the variant information in the database.
8117        """
8118
8119        # if FORMAT and samples
8120        if (
8121            "FORMAT" in self.get_header_columns_as_list()
8122            and self.get_header_sample_list()
8123        ):
8124
8125            # genotypeconcordance annotation field
8126            genotypeconcordance_tag = "genotypeconcordance"
8127
8128            # VCF infos tags
8129            vcf_infos_tags = {
8130                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8131            }
8132
8133            # Prefix
8134            prefix = self.get_explode_infos_prefix()
8135
8136            # Field
8137            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8138
8139            # Variants table
8140            table_variants = self.get_table_variants()
8141
8142            # Header
8143            vcf_reader = self.get_header()
8144
8145            # Create variant id
8146            variant_id_column = self.get_variant_id_column()
8147            added_columns = [variant_id_column]
8148
8149            # variant_id, FORMAT and samples
8150            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8151                self.get_header_sample_list()
8152            )
8153
8154            # Create dataframe
8155            dataframe_genotypeconcordance = self.get_query_to_df(
8156                f""" SELECT {samples_fields} FROM {table_variants} """
8157            )
8158
8159            # Create genotypeconcordance column
8160            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8161                dataframe_genotypeconcordance.apply(
8162                    lambda row: genotypeconcordance(
8163                        row, samples=self.get_header_sample_list()
8164                    ),
8165                    axis=1,
8166                )
8167            )
8168
8169            # Add genotypeconcordance to header
8170            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8171                genotypeconcordance_tag,
8172                ".",
8173                "String",
8174                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8175                "howard calculation",
8176                "0",
8177                self.code_type_map.get("String"),
8178            )
8179
8180            # Update
8181            sql_update = f"""
8182                UPDATE variants
8183                SET "INFO" = 
8184                    concat(
8185                        CASE
8186                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8187                            THEN ''
8188                            ELSE concat("INFO", ';')
8189                        END,
8190                        CASE
8191                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8192                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8193                            THEN concat(
8194                                    '{genotypeconcordance_tag}=',
8195                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8196                                )
8197                            ELSE ''
8198                        END
8199                    )
8200                FROM dataframe_genotypeconcordance
8201                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8202            """
8203            self.conn.execute(sql_update)
8204
8205            # Remove added columns
8206            for added_column in added_columns:
8207                self.drop_column(column=added_column)
8208
8209            # Delete dataframe
8210            del dataframe_genotypeconcordance
8211            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8213    def calculation_barcode(self, tag: str = "barcode") -> None:
8214        """
8215        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8216        updates the INFO field in the file with the calculated barcode values.
8217        """
8218
8219        # if FORMAT and samples
8220        if (
8221            "FORMAT" in self.get_header_columns_as_list()
8222            and self.get_header_sample_list()
8223        ):
8224
8225            # barcode annotation field
8226            if not tag:
8227                tag = "barcode"
8228
8229            # VCF infos tags
8230            vcf_infos_tags = {
8231                tag: "barcode calculation (VaRank)",
8232            }
8233
8234            # Prefix
8235            prefix = self.get_explode_infos_prefix()
8236
8237            # Field
8238            barcode_infos = prefix + tag
8239
8240            # Variants table
8241            table_variants = self.get_table_variants()
8242
8243            # Header
8244            vcf_reader = self.get_header()
8245
8246            # Create variant id
8247            variant_id_column = self.get_variant_id_column()
8248            added_columns = [variant_id_column]
8249
8250            # variant_id, FORMAT and samples
8251            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8252                self.get_header_sample_list()
8253            )
8254
8255            # Create dataframe
8256            dataframe_barcode = self.get_query_to_df(
8257                f""" SELECT {samples_fields} FROM {table_variants} """
8258            )
8259
8260            # Create barcode column
8261            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8262                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8263            )
8264
8265            # Add barcode to header
8266            vcf_reader.infos[tag] = vcf.parser._Info(
8267                tag,
8268                ".",
8269                "String",
8270                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8271                "howard calculation",
8272                "0",
8273                self.code_type_map.get("String"),
8274            )
8275
8276            # Update
8277            sql_update = f"""
8278                UPDATE {table_variants}
8279                SET "INFO" = 
8280                    concat(
8281                        CASE
8282                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8283                            THEN ''
8284                            ELSE concat("INFO", ';')
8285                        END,
8286                        CASE
8287                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8288                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8289                            THEN concat(
8290                                    '{tag}=',
8291                                    dataframe_barcode."{barcode_infos}"
8292                                )
8293                            ELSE ''
8294                        END
8295                    )
8296                FROM dataframe_barcode
8297                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8298            """
8299            self.conn.execute(sql_update)
8300
8301            # Remove added columns
8302            for added_column in added_columns:
8303                self.drop_column(column=added_column)
8304
8305            # Delete dataframe
8306            del dataframe_barcode
8307            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode for variants in a VCF
        file and writes it as two new FORMAT fields (the barcode and the list of family samples)
        appended to every sample column.

        The family is taken from the pedigree configured in
        `calculation.calculations.BARCODEFAMILY.family_pedigree` (a JSON file
        path, a JSON string, a comma-separated sample list, or a dict); when no
        pedigree is configured, all samples of the VCF are used.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only meaningful when the VCF carries genotypes (FORMAT + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit falsy tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags: barcode field and companion samples field
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param (pedigree describing the family members)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED — accept a JSON file path, a JSON string, a
            # comma-separated sample list, or an already-built dict
            if ped:

                # Pedigree is a file
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # each mapped to itself (exception detail unused)
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of family sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the VCF
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Exploded column name for the barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE(review): the local name 'dataframe_barcode' is referenced
            # by the UPDATE below (duckdb replacement scan) — do not rename
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the barcode over the family samples for each variant
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode FORMAT field in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            # Declare the companion '<tag>S' FORMAT field (family sample list)
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            #   - family samples get the barcode value and the sample list
            #   - FORMAT gets the two new field names
            #   - other samples get '.' placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # NOTE(review): for './.' genotypes the nested regexp_replace
                # appears to turn the FORMAT string into a ':.'-per-field
                # missing-value skeleton (alphanumerics/whitespace stripped,
                # each ':' becomes ':.') — confirm against sample data
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe eagerly
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
8499    def calculation_trio(self) -> None:
8500        """
8501        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8502        information to the INFO field of each variant.
8503        """
8504
8505        # if FORMAT and samples
8506        if (
8507            "FORMAT" in self.get_header_columns_as_list()
8508            and self.get_header_sample_list()
8509        ):
8510
8511            # trio annotation field
8512            trio_tag = "trio"
8513
8514            # VCF infos tags
8515            vcf_infos_tags = {
8516                "trio": "trio calculation",
8517            }
8518
8519            # Param
8520            param = self.get_param()
8521
8522            # Prefix
8523            prefix = self.get_explode_infos_prefix()
8524
8525            # Trio param
8526            trio_ped = (
8527                param.get("calculation", {})
8528                .get("calculations", {})
8529                .get("TRIO", {})
8530                .get("trio_pedigree", None)
8531            )
8532
8533            # Load trio
8534            if trio_ped:
8535
8536                # Trio pedigree is a file
8537                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8538                    log.debug("TRIO pedigree is file")
8539                    with open(full_path(trio_ped)) as trio_ped:
8540                        trio_ped = json.load(trio_ped)
8541
8542                # Trio pedigree is a string
8543                elif isinstance(trio_ped, str):
8544                    log.debug("TRIO pedigree is str")
8545                    try:
8546                        trio_ped = json.loads(trio_ped)
8547                        log.debug("TRIO pedigree is json str")
8548                    except ValueError as e:
8549                        trio_samples = trio_ped.split(",")
8550                        if len(trio_samples) == 3:
8551                            trio_ped = {
8552                                "father": trio_samples[0],
8553                                "mother": trio_samples[1],
8554                                "child": trio_samples[2],
8555                            }
8556                            log.debug("TRIO pedigree is list str")
8557                        else:
8558                            msg_error = "TRIO pedigree not well formatted"
8559                            log.error(msg_error)
8560                            raise ValueError(msg_error)
8561
8562                # Trio pedigree is a dict
8563                elif isinstance(trio_ped, dict):
8564                    log.debug("TRIO pedigree is dict")
8565
8566                # Trio pedigree is not well formatted
8567                else:
8568                    msg_error = "TRIO pedigree not well formatted"
8569                    log.error(msg_error)
8570                    raise ValueError(msg_error)
8571
8572                # Construct trio list
8573                trio_samples = [
8574                    trio_ped.get("father", ""),
8575                    trio_ped.get("mother", ""),
8576                    trio_ped.get("child", ""),
8577                ]
8578
8579            else:
8580                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8581                samples_list = self.get_header_sample_list()
8582                if len(samples_list) >= 3:
8583                    trio_samples = self.get_header_sample_list()[0:3]
8584                    trio_ped = {
8585                        "father": trio_samples[0],
8586                        "mother": trio_samples[1],
8587                        "child": trio_samples[2],
8588                    }
8589                else:
8590                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8591                    log.error(msg_error)
8592                    raise ValueError(msg_error)
8593
8594            # Check trio pedigree
8595            if not trio_ped or len(trio_ped) != 3:
8596                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8597                log.error(msg_error)
8598                raise ValueError(msg_error)
8599
8600            # Log
8601            log.info(
8602                f"Calculation 'TRIO' - Samples: "
8603                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
8604            )
8605
8606            # Field
8607            trio_infos = prefix + trio_tag
8608
8609            # Variants table
8610            table_variants = self.get_table_variants()
8611
8612            # Header
8613            vcf_reader = self.get_header()
8614
8615            # Create variant id
8616            variant_id_column = self.get_variant_id_column()
8617            added_columns = [variant_id_column]
8618
8619            # variant_id, FORMAT and samples
8620            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8621                self.get_header_sample_list()
8622            )
8623
8624            # Create dataframe
8625            dataframe_trio = self.get_query_to_df(
8626                f""" SELECT {samples_fields} FROM {table_variants} """
8627            )
8628
8629            # Create trio column
8630            dataframe_trio[trio_infos] = dataframe_trio.apply(
8631                lambda row: trio(row, samples=trio_samples), axis=1
8632            )
8633
8634            # Add trio to header
8635            vcf_reader.infos[trio_tag] = vcf.parser._Info(
8636                trio_tag,
8637                ".",
8638                "String",
8639                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
8640                "howard calculation",
8641                "0",
8642                self.code_type_map.get("String"),
8643            )
8644
8645            # Update
8646            sql_update = f"""
8647                UPDATE {table_variants}
8648                SET "INFO" = 
8649                    concat(
8650                        CASE
8651                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8652                            THEN ''
8653                            ELSE concat("INFO", ';')
8654                        END,
8655                        CASE
8656                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
8657                             AND dataframe_trio."{trio_infos}" NOT NULL
8658                            THEN concat(
8659                                    '{trio_tag}=',
8660                                    dataframe_trio."{trio_infos}"
8661                                )
8662                            ELSE ''
8663                        END
8664                    )
8665                FROM dataframe_trio
8666                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
8667            """
8668            self.conn.execute(sql_update)
8669
8670            # Remove added columns
8671            for added_column in added_columns:
8672                self.drop_column(column=added_column)
8673
8674            # Delete dataframe
8675            del dataframe_trio
8676            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
8678    def calculation_vaf_normalization(self) -> None:
8679        """
8680        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
8681        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
8682        :return: The function does not return anything.
8683        """
8684
8685        # if FORMAT and samples
8686        if (
8687            "FORMAT" in self.get_header_columns_as_list()
8688            and self.get_header_sample_list()
8689        ):
8690
8691            # vaf_normalization annotation field
8692            vaf_normalization_tag = "VAF"
8693
8694            # VCF infos tags
8695            vcf_infos_tags = {
8696                "VAF": "VAF Variant Frequency",
8697            }
8698
8699            # Prefix
8700            prefix = self.get_explode_infos_prefix()
8701
8702            # Variants table
8703            table_variants = self.get_table_variants()
8704
8705            # Header
8706            vcf_reader = self.get_header()
8707
8708            # Do not calculate if VAF already exists
8709            if "VAF" in vcf_reader.formats:
8710                log.debug("VAF already on genotypes")
8711                return
8712
8713            # Create variant id
8714            variant_id_column = self.get_variant_id_column()
8715            added_columns = [variant_id_column]
8716
8717            # variant_id, FORMAT and samples
8718            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8719                self.get_header_sample_list()
8720            )
8721
8722            # Create dataframe
8723            dataframe_vaf_normalization = self.get_query_to_df(
8724                f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
8725            )
8726
8727            vaf_normalization_set = []
8728
8729            # for each sample vaf_normalization
8730            for sample in self.get_header_sample_list():
8731                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
8732                    lambda row: vaf_normalization(row, sample=sample), axis=1
8733                )
8734                vaf_normalization_set.append(
8735                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
8736                )
8737
8738            # Add VAF to FORMAT
8739            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
8740                "FORMAT"
8741            ].apply(lambda x: str(x) + ":VAF")
8742            vaf_normalization_set.append(
8743                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
8744            )
8745
8746            # Add vaf_normalization to header
8747            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
8748                id=vaf_normalization_tag,
8749                num="1",
8750                type="Float",
8751                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
8752                type_code=self.code_type_map.get("Float"),
8753            )
8754
8755            # Create fields to add in INFO
8756            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
8757
8758            # Update
8759            sql_update = f"""
8760                UPDATE {table_variants}
8761                SET {sql_vaf_normalization_set}
8762                FROM dataframe_vaf_normalization
8763                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
8764
8765            """
8766            self.conn.execute(sql_update)
8767
8768            # Remove added columns
8769            for added_column in added_columns:
8770                self.drop_column(column=added_column)
8771
8772            # Delete dataframe
8773            del dataframe_vaf_normalization
8774            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
8776    def calculation_genotype_stats(self, info: str = "VAF") -> None:
8777        """
8778        The `calculation_genotype_stats` function calculates genotype statistics for a given information
8779        field in a VCF file and updates the INFO column of the variants table with the calculated
8780        statistics.
8781
8782        :param info: The `info` parameter is a string that represents the type of information for which
8783        genotype statistics are calculated. It is used to generate various VCF info tags for the
8784        statistics, such as the number of occurrences, the list of values, the minimum value, the
8785        maximum value, the mean, the median, defaults to VAF
8786        :type info: str (optional)
8787        """
8788
8789        # if FORMAT and samples
8790        if (
8791            "FORMAT" in self.get_header_columns_as_list()
8792            and self.get_header_sample_list()
8793        ):
8794
8795            # vaf_stats annotation field
8796            vaf_stats_tag = info + "_stats"
8797
8798            # VCF infos tags
8799            vcf_infos_tags = {
8800                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
8801                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
8802                info + "_stats_min": f"genotype {info} Statistics - min {info}",
8803                info + "_stats_max": f"genotype {info} Statistics - max {info}",
8804                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
8805                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
8806                info
8807                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
8808            }
8809
8810            # Prefix
8811            prefix = self.get_explode_infos_prefix()
8812
8813            # Field
8814            vaf_stats_infos = prefix + vaf_stats_tag
8815
8816            # Variants table
8817            table_variants = self.get_table_variants()
8818
8819            # Header
8820            vcf_reader = self.get_header()
8821
8822            # Create variant id
8823            variant_id_column = self.get_variant_id_column()
8824            added_columns = [variant_id_column]
8825
8826            # variant_id, FORMAT and samples
8827            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8828                self.get_header_sample_list()
8829            )
8830
8831            # Create dataframe
8832            dataframe_vaf_stats = self.get_query_to_df(
8833                f""" SELECT {samples_fields} FROM {table_variants} """
8834            )
8835
8836            # Create vaf_stats column
8837            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
8838                lambda row: genotype_stats(
8839                    row, samples=self.get_header_sample_list(), info=info
8840                ),
8841                axis=1,
8842            )
8843
8844            # List of vcf tags
8845            sql_vaf_stats_fields = []
8846
8847            # Check all VAF stats infos
8848            for stat in vcf_infos_tags:
8849
8850                # Extract stats
8851                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
8852                    lambda x: dict(x).get(stat, "")
8853                )
8854
8855                # Add snpeff_hgvs to header
8856                vcf_reader.infos[stat] = vcf.parser._Info(
8857                    stat,
8858                    ".",
8859                    "String",
8860                    vcf_infos_tags.get(stat, "genotype statistics"),
8861                    "howard calculation",
8862                    "0",
8863                    self.code_type_map.get("String"),
8864                )
8865
8866                if len(sql_vaf_stats_fields):
8867                    sep = ";"
8868                else:
8869                    sep = ""
8870
8871                # Create fields to add in INFO
8872                sql_vaf_stats_fields.append(
8873                    f"""
8874                        CASE
8875                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
8876                            THEN concat(
8877                                    '{sep}{stat}=',
8878                                    dataframe_vaf_stats."{stat}"
8879                                )
8880                            ELSE ''
8881                        END
8882                    """
8883                )
8884
8885            # SQL set for update
8886            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
8887
8888            # Update
8889            sql_update = f"""
8890                UPDATE variants
8891                SET "INFO" = 
8892                    concat(
8893                        CASE
8894                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8895                            THEN ''
8896                            ELSE concat("INFO", ';')
8897                        END,
8898                        {sql_vaf_stats_fields_set}
8899                    )
8900                FROM dataframe_vaf_stats
8901                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
8902
8903            """
8904            self.conn.execute(sql_update)
8905
8906            # Remove added columns
8907            for added_column in added_columns:
8908                self.drop_column(column=added_column)
8909
8910            # Delete dataframe
8911            del dataframe_vaf_stats
8912            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: a string naming the genotype information field for which statistics are calculated (defaults to "VAF"). It is used to generate the VCF info tags for the statistics: number of values, list of values, minimum, maximum, mean, median, and standard deviation.